You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spamassassin.apache.org by Giovanni Bechis <gi...@paclan.it> on 2021/10/14 14:15:13 UTC

new Pyzor implementation

Hi,
cPanel has developed a native Perl Pyzor implementation for SpamAssassin
and a diff against SpamAssassin 4.0 follows.
Atm I am using it in production on a small server, more tests and
opinions are welcome.

Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.

 Cheers
  Giovanni

diff --git a/MANIFEST b/MANIFEST
index 25d0192..2d9588c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
 lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
 lib/Mail/SpamAssassin/PluginHandler.pm
 lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
+lib/Mail/SpamAssassin/Pyzor/Client.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
+lib/Mail/SpamAssassin/Pyzor/Digest.pm
+lib/Mail/SpamAssassin/Pyzor.pm
 lib/Mail/SpamAssassin/RegistryBoundaries.pm
 lib/Mail/SpamAssassin/Reporter.pm
 lib/Mail/SpamAssassin/SQLBasedAddrList.pm
diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
index 3efd4b4..e4c9c05 100644
--- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
+++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
@@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
-use Mail::SpamAssassin::Timeout;
-use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
-                                proc_status_ok exit_status_str);
+use Mail::SpamAssassin::Util qw(untaint_var);
+
 use strict;
 use warnings;
 # use bytes;
 use re 'taint';
 
-use Storable;
-use POSIX qw(PIPE_BUF WNOHANG _exit);
-
 our @ISA = qw(Mail::SpamAssassin::Plugin);
 
 sub new {
@@ -78,7 +74,7 @@ sub set_config {
   my ($self, $conf) = @_;
   my @cmds;
 
-=head1 USER OPTIONS
+=head1 ADMINISTRATOR OPTIONS
 
 =over 4
 
@@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
   });
 
-=item pyzor_fork (0|1)		(default: 0)
-
-Instead of running Pyzor synchronously, fork separate process for it and
-read the results in later (similar to async DNS lookups).  Increases
-throughput.  Experimental.
-
-=cut
-
-  push(@cmds, {
-    setting => 'pyzor_fork',
-    is_admin => 1,
-    default => 0,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
-  });
-
-=item pyzor_count_min NUMBER	(default: 5)
+=item pyzor_count_min NUMBER		(default: 5)
 
 This option sets how often a message's body checksum must have been
 reported to the Pyzor server before SpamAssassin will consider the Pyzor
@@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
-  # Deprecated setting, the name makes no sense!
-  push (@cmds, {
-    setting => 'pyzor_max',
-    is_admin => 1,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
-      if ($value !~ /^\d+$/) {
-        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-      $self->{pyzor_count_min} = $value;
-    }
-  });
-
-=item pyzor_whitelist_min NUMBER	(default: 10)
-
-This option sets how often a message's body checksum must have been
-whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
-result.  Final decision is made by pyzor_whitelist_factor.
-
-=cut
-
-  push (@cmds, {
-    setting => 'pyzor_whitelist_min',
-    is_admin => 1,
-    default => 10,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
-  });
-
-=item pyzor_whitelist_factor NUMBER	(default: 0.2)
-
-Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
-For default setting this means: 50 reports requires 10 whitelistings.
-
-=cut
-
-  push (@cmds, {
-    setting => 'pyzor_whitelist_factor',
-    is_admin => 1,
-    default => 0.2,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
-  });
-
 =back
 
-=head1 ADMINISTRATOR OPTIONS
-
 =over 4
 
 =item pyzor_timeout n		(default: 5)
@@ -210,478 +145,182 @@ removing one of them.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
   });
 
-=item pyzor_options options
+=item pyzor_whitelist_min NUMBER        (default: 10)
 
-Specify additional options to the pyzor(1) command. Please note that only
-characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
+This option sets how often a message's body checksum must have been
+whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
+result.  Final decision is made by pyzor_whitelist_factor.
 
 =cut
 
   push (@cmds, {
-    setting => 'pyzor_options',
+    setting => 'pyzor_whitelist_min',
     is_admin => 1,
-    default => '',
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
-	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-      $self->{pyzor_options} = $1;
-    }
+    default => 10,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
-=item pyzor_path STRING
+=item pyzor_whitelist_factor NUMBER     (default: 0.2)
 
-This option tells SpamAssassin specifically where to find the C<pyzor>
-client instead of relying on SpamAssassin to find it in the current
-PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
-you should use this, as the current PATH will have been cleared.
+Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
+For default setting this means: 50 reports requires 10 whitelistings.
 
 =cut
 
   push (@cmds, {
-    setting => 'pyzor_path',
+    setting => 'pyzor_whitelist_factor',
     is_admin => 1,
-    default => undef,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      if (!defined $value || !length $value) {
-	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
-      }
-      $value = untaint_file_path($value);
-      if (!-x $value) {
-	info("config: pyzor_path \"$value\" isn't an executable");
-	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-
-      $self->{pyzor_path} = $value;
-    }
+    default => 0.2,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
   $conf->{parser}->register_commands(\@cmds);
 }
 
 sub is_pyzor_available {
-  my ($self) = @_;
+    my ($self) = @_;
 
-  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
-    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
-
-  unless ($pyzor && -x $pyzor) {
-    dbg("pyzor: no pyzor executable found");
-    $self->{pyzor_available} = 0;
-    return 0;
-  }
-
-  # remember any found pyzor
-  $self->{main}->{conf}->{pyzor_path} = $pyzor;
-
-  dbg("pyzor: pyzor is available: $pyzor");
-  return 1;
+    local $@;
+    eval {
+        require Mail::SpamAssassin::Pyzor::Digest;
+        require Mail::SpamAssassin::Pyzor::Client;
+    };
+    return $@ ? 0 : 1;
 }
 
-sub finish_parsing_start {
-  my ($self, $opts) = @_;
+sub get_pyzor_interface {
+  my ($self) = @_;
 
-  # If forking, hard adjust priority -100 to launch early
-  # Find rulenames from eval_to_rule mappings
-  if ($opts->{conf}->{pyzor_fork}) {
-    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
-      dbg("pyzor: adjusting rule $_ priority to -100");
-      $opts->{conf}->{priority}->{$_} = -100;
-    }
+  if (!$self->{main}->{conf}->{use_pyzor}) {
+    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
+    $self->{pyzor_interface} = "disabled";
+    $self->{pyzor_available} = 0;
+  }
+  elsif ($self->is_pyzor_available()) {
+    $self->{pyzor_interface} = "pyzor";
+    $self->{pyzor_available} = 1;
+  }
+  else {
+    dbg("pyzor: no pyzor found, disabling Pyzor");
+    $self->{pyzor_available} = 0;
   }
 }
 
 sub check_pyzor {
-  my ($self, $pms, $full) = @_;
-
-  return 0 if !$self->{pyzor_available};
-  return 0 if !$self->{main}->{conf}->{use_pyzor};
-
-  return 0 if $pms->{pyzor_running};
-  $pms->{pyzor_running} = 1;
-
-  return 0 if !$self->is_pyzor_available();
-
-  my $timer = $self->{main}->time_method("check_pyzor");
+  my ($self, $permsgstatus, $full) = @_;
 
   # initialize valid tags
-  $pms->{tag_data}->{PYZOR} = '';
-
-  # create fulltext tmpfile now (before possible forking)
-  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
-
-  ## non-forking method
-
-  if (!$self->{main}->{conf}->{pyzor_fork}) {
-    my @results = $self->pyzor_lookup($pms);
-    return $self->_check_result($pms, \@results);
-  }
-
-  ## forking method
-
-  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
-  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
-
-  # create socketpair for communication
-  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
-  my $back_selector = '';
-  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
-  eval {
-    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
-  } or do {
-    dbg("pyzor: backchannel pre-setup failed: $@");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  };
+  $permsgstatus->{tag_data}->{PYZOR} = "";
 
-  my $pid = fork();
-  if (!defined $pid) {
-    info("pyzor: child fork failed: $!");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
-  if (!$pid) {
-    $0 = "$0 (pyzor)";
-    $SIG{CHLD} = 'DEFAULT';
-    $SIG{PIPE} = 'IGNORE';
-    $SIG{$_} = sub {
-      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
-      _exit(6);  # avoid END and destructor processing
-      kill('KILL',$$);  # still kicking? die!
-      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
-    dbg("pyzor: child process $$ forked");
-    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
-    my @results = $self->pyzor_lookup($pms);
-    my $backmsg;
-    eval {
-      $backmsg = Storable::freeze(\@results);
-    };
-    if ($@) {
-      dbg("pyzor: child return value freeze failed: $@");
-      _exit(0); # avoid END and destructor processing
-    }
-    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
-      dbg("pyzor: child backchannel write failed: $!");
-    }
-    _exit(0); # avoid END and destructor processing
-  }
-
-  $pms->{pyzor_pid} = $pid;
+  my $timer = $self->{main}->time_method("check_pyzor");
 
-  eval {
-    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
-  } or do {
-    dbg("pyzor: backchannel post-setup failed: $@");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  };
+  $self->get_pyzor_interface();
+  return 0 unless $self->{pyzor_available};
 
-  return 0;
+  return $self->pyzor_lookup($permsgstatus, $full);
 }
 
 sub pyzor_lookup {
-  my ($self, $pms) = @_;
-
-  my $conf = $self->{main}->{conf};
-  my $timeout = $conf->{pyzor_timeout};
-
-  # note: not really tainted, this came from system configuration file
-  my $path = untaint_file_path($conf->{pyzor_path});
-  my $opts = untaint_var($conf->{pyzor_options}) || '';
-
-  $pms->enter_helper_run_mode();
-
-  my $pid;
-  my @resp;
-  my $timer = Mail::SpamAssassin::Timeout->new(
-           { secs => $timeout, deadline => $pms->{master_deadline} });
-  my $err = $timer->run_and_catch(sub {
-    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
-    dbg("pyzor: opening pipe: ".
-      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
-
-    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
-	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
-    $pid or die "$!\n";
-
-    # read+split avoids a Perl I/O bug (Bug 5985)
-    my($inbuf, $nread);
-    my $resp = '';
-    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
-    defined $nread  or die "error reading from pipe: $!";
-    @resp = split(/^/m, $resp, -1);
-
-    my $errno = 0;
-    close PYZOR or $errno = $!;
-    if (proc_status_ok($?, $errno)) {
-      dbg("pyzor: [%s] finished successfully", $pid);
-    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
-      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
-    } else {
-      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
-    }
-
-  });
-
-  if (defined(fileno(*PYZOR))) {  # still open
-    if ($pid) {
-      if (kill('TERM', $pid)) {
-        dbg("pyzor: killed stale helper [$pid]");
-      } else {
-        dbg("pyzor: killing helper application [$pid] failed: $!");
-      }
-    }
-    my $errno = 0;
-    close PYZOR or $errno = $!;
-    proc_status_ok($?, $errno)
-      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
-  }
-
-  $pms->leave_helper_run_mode();
-
-  if ($timer->timed_out()) {
-    dbg("pyzor: check timed out after $timeout seconds");
-    return ();
-  } elsif ($err) {
-    chomp $err;
-    info("pyzor: check failed: $err");
-    return ();
-  }
-
-  return @resp;
-}
-
-sub check_tick {
-  my ($self, $opts) = @_;
-  $self->_check_forked_result($opts->{permsgstatus}, 0);
-}
-
-sub check_cleanup {
-  my ($self, $opts) = @_;
-  $self->_check_forked_result($opts->{permsgstatus}, 1);
-}
-
-sub _check_forked_result {
-  my ($self, $pms, $finish) = @_;
-
-  return 0 if !$pms->{pyzor_backchannel};
-  return 0 if !$pms->{pyzor_pid};
+    my ( $self, $permsgstatus, $fulltext ) = @_;
+    my $conf = $self->{main}->{conf};
+    my $timeout = $conf->{pyzor_timeout};
+
+    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
+
+    local $@;
+    my $ref = eval { $client->check($digest); };
+    dbg("pyzor: got response: $client->{'_server_host'}");
+    # $client reply must be an hash
+    return 0 if (not (ref $ref eq ref {}));
+    if ($@) {
+        my $err = $@;
 
-  my $timer = $self->{main}->time_method("check_pyzor");
+        $err = eval { $err->get_message() } || $err;
 
-  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
-
-  my $kid_pid = $pms->{pyzor_pid};
-  # if $finish, force waiting for the child
-  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
-  if ($pid == 0) {
-    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
-    if ($pms->{pyzor_abort}) {
-      dbg("pyzor: bailing out due to deadline/shortcircuit");
-      kill('TERM', $kid_pid);
-      if (waitpid($kid_pid, WNOHANG) == 0) {
-        sleep(1);
-        if (waitpid($kid_pid, WNOHANG) == 0) {
-          dbg("pyzor: child process $kid_pid still alive, KILL");
-          kill('KILL', $kid_pid);
-          waitpid($kid_pid, 0);
+        warn("pyzor: check failed: $err\n");
+        return 0;
+    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
+        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
+          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
+        } else {
+          dbg("pyzor: check failed with undefined code");
         }
-      }
-      delete $pms->{pyzor_pid};
-      delete $pms->{pyzor_backchannel};
+        return 0;
     }
-    return 0;
-  } elsif ($pid == -1) {
-    # child does not exist?
-    dbg("pyzor: child process $kid_pid already handled?");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
 
-  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
+    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
+    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
+    my $count_min = $conf->{pyzor_count_min};
+    my $wl_min = $conf->{pyzor_whitelist_min};
 
-  dbg("pyzor: child process $kid_pid finished, reading results");
+    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
+      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
 
-  my $backmsg;
-  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
-  if (!defined $ret || $ret == 0) {
-    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
-
-  delete $pms->{pyzor_backchannel};
+    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
 
-  my $results;
-  eval {
-    $results = Storable::thaw($backmsg);
-  };
-  if ($@) {
-    dbg("pyzor: child return value thaw failed: $@");
-    return;
-  }
-
-  $self->_check_result($pms, $results);
-}
+    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
+      $wl_limit);
 
-sub _check_result {
-  my ($self, $pms, $results) = @_;
-
-  if (!@$results) {
-    dbg("pyzor: no response from server");
-    return 0;
-  }
-
-  my $count = 0;
-  my $count_wl = 0;
-  foreach my $res (@$results) {
-    chomp($res);
-    if ($res =~ /^Traceback/) {
-      info("pyzor: internal error, python traceback seen in response: $res");
+    # Empty body etc results in same hash, we should skip very large numbers..
+    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
+      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
       return 0;
     }
-    dbg("pyzor: got response: $res");
-    # this regexp is intended to be a little bit forgiving
-    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
-      # until pyzor servers can sync their DBs,
-      # sum counts obtained from all servers
-      $count += untaint_var($1)+0; # crazy but needs untainting
-      $count_wl += untaint_var($2)+0;
-    } else {
-      # warn on failures to parse
-      info("pyzor: failure to parse response \"$res\"");
-    }
-  }
-
-  my $conf = $self->{main}->{conf};
-
-  my $count_min = $conf->{pyzor_count_min};
-  my $wl_min = $conf->{pyzor_whitelist_min};
 
-  my $wl_limit = $count_wl >= $wl_min ?
-    $count * $conf->{pyzor_whitelist_factor} : 0;
-
-  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
-    $wl_limit);
-  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
-
-  # Empty body etc results in same hash, we should skip very large numbers..
-  if ($count >= 1000000 || $count_wl >= 10000) {
-    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
-    return 0;
-  }
-
-  # Whitelisted?
-  if ($wl_limit && $count_wl >= $wl_limit) {
-    dbg("pyzor: message whitelisted");
-    return 0;
-  }
+    # Whitelisted?
+    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
+      dbg("pyzor: message whitelisted");
+      return 0;
+    }
 
-  if ($count >= $count_min) {
-    if ($conf->{pyzor_fork}) {
-      # forked needs to run got_hit()
-      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
+    if ( $pyzor_count >= $count_min ) {
+      return 1;
     }
-    return 1;
-  }
 
-  return 0;
+    return 0;
 }
 
 sub plugin_report {
   my ($self, $options) = @_;
 
-  return if !$self->{pyzor_available};
-  return if !$self->{main}->{conf}->{use_pyzor};
-  return if $options->{report}->{options}->{dont_report_to_pyzor};
-  return if !$self->is_pyzor_available();
-
-  # use temporary file: open2() is unreliable due to buffering under spamd
-  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
-  if ($self->pyzor_report($options, $tmpf)) {
-    $options->{report}->{report_available} = 1;
-    info("reporter: spam reported to Pyzor");
-    $options->{report}->{report_return} = 1;
-  }
-  else {
-    info("reporter: could not report spam to Pyzor");
-  }
-  $options->{report}->delete_fulltext_tmpfile($tmpf);
+  return unless $self->{pyzor_available};
+  return unless $self->{main}->{conf}->{use_pyzor};
 
-  return 1;
+  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
+  {
+    if ($self->pyzor_report($options)) {
+      $options->{report}->{report_available} = 1;
+      info("reporter: spam reported to Pyzor");
+      $options->{report}->{report_return} = 1;
+    }
+    else {
+      info("reporter: could not report spam to Pyzor");
+    }
+  }
 }
 
 sub pyzor_report {
-  my ($self, $options, $tmpf) = @_;
-
-  # note: not really tainted, this came from system configuration file
-  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
-  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
+    my ( $self, $options ) = @_;
 
-  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
+    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
 
-  $options->{report}->enter_helper_run_mode();
+    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
 
-  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
-  my $err = $timer->run_and_catch(sub {
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
 
-    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
-    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
-
-    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
-	$tmpf, 1, $path, split(' ', $opts), "report");
-    $pid or die "$!\n";
-
-    my($inbuf,$nread,$nread_all); $nread_all = 0;
-    # response is ignored, just check its existence
-    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
-    defined $nread  or die "error reading from pipe: $!";
-
-    dbg("pyzor: empty response")  if $nread_all < 1;
-
-    my $errno = 0;  close PYZOR or $errno = $!;
-    # closing a pipe also waits for the process executing on the pipe to
-    # complete, no need to explicitly call waitpid
-    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
-    if (proc_status_ok($?,$errno, 0)) {
-      dbg("pyzor: [%s] reporter finished successfully", $pid);
-    } else {
-      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
+    local $@;
+    my $ref = eval { $client->report($digest); };
+    if ($@) {
+        warn("pyzor: report failed: $@");
+        return 0;
     }
-
-  });
-
-  $options->{report}->leave_helper_run_mode();
-
-  if ($timer->timed_out()) {
-    dbg("reporter: pyzor report timed out after $timeout seconds");
-    return 0;
-  }
-
-  if ($err) {
-    chomp $err;
-    if ($err eq '__brokenpipe__ignore__') {
-      dbg("reporter: pyzor report failed: broken pipe");
-    } else {
-      warn("reporter: pyzor report failed: $err\n");
+    elsif ( $ref->{'Code'} ne 200 ) {
+        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
+        return 0;
     }
-    return 0;
-  }
 
-  return 1;
+    return 1;
 }
 
-# Version features
-sub has_fork { 1 }
-
 1;
-
-=back
-
-=cut
diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
new file mode 100644
index 0000000..8ac27f4
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor.pm
@@ -0,0 +1,56 @@
+package Mail::SpamAssassin::Pyzor;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+#
+
+use strict;
+use warnings;
+
+our $VERSION = '0.06_01';
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
+
+=head1 DESCRIPTION
+
+This distribution contains Perl implementations of parts of
+L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
+It is intended for use with L<Mail::SpamAssassin> but may be useful
+in other contexts.
+
+See the following modules for information on specific tools that
+the distribution includes:
+
+=over
+
+=item * L<Mail::SpamAssassin::Pyzor::Client>
+
+=item * L<Mail::SpamAssassin::Pyzor::Digest>
+
+=back
+
+=cut
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
new file mode 100644
index 0000000..ccff868
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
@@ -0,0 +1,415 @@
+package Mail::SpamAssassin::Pyzor::Client;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
+
+=head1 SYNOPSIS
+
+    use Mail::SpamAssassin::Pyzor::Client ();
+    use Mail::SpamAssassin::Pyzor::Digest ();
+
+    my $client = Mail::SpamAssassin::Pyzor::Client->new();
+
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
+
+    my $check_ref = $client->check($digest);
+    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
+
+    my $report_ref = $client->report($digest);
+    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
+
+=head1 DESCRIPTION
+
+A bare-bones L<Pyzor|http://pyzor.org> client that currently only
+implements the functionality needed for L<Mail::SpamAssassin>.
+
+=head1 PROTOCOL DETAILS
+
+The Pyzor protocol is not a published standard, and there appears to be
+no meaningful public documentation. What follows is enough information,
+largely gleaned through forum posts and reverse engineering, to facilitate
+effective use of this module:
+
+Pyzor is an RPC-oriented, message-based protocol. Each message
+is a simple dictionary of 7-bit ASCII keys and values. Server responses
+always include at least the following:
+
+=over
+
+=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
+is an error.
+
+=item * C<Diag> - Similar to HTTP status reasons: a text description
+of the status.
+
+=back
+
+(NB: There are additional standard response headers that are useful only for
+the protocol itself and thus are not part of this module’s returns.)
+
+=head2 Reliability
+
+Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
+destination. A transmission failure can happen in either the request or
+the response; in either case, a timeout error will result. Such errors
+are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
+
+=cut
+
+#----------------------------------------------------------------------
+
+our $VERSION = '0.04';
+
+our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
+our $DEFAULT_SERVER_PORT    = 24441;
+our $DEFAULT_USERNAME       = 'anonymous';
+our $DEFAULT_PASSWORD       = '';
+our $DEFAULT_OP_SPEC        = '20,3,60,3';
+our $PYZOR_PROTOCOL_VERSION = 2.1;
+our $DEFAULT_TIMEOUT        = 3.5;
+our $READ_SIZE              = 8192;
+
+use IO::Socket::INET ();
+use Digest::SHA qw(sha1 sha1_hex);
+
+my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
+
+#----------------------------------------------------------------------
+
+=head1 CONSTRUCTOR
+
+=head2 new(%OPTS)
+
+Create a new pyzor client.
+
+=over 2
+
+=item Input
+
+%OPTS are (all optional):
+
+=over 3
+
+=item * C<server_host> - The pyzor server host to connect to (default is
+C<public.pyzor.org>)
+
+=item * C<server_port> - The pyzor server port to connect to (default is
+24441)
+
+=item * C<username> - The username to present to the pyzor server (default
+is C<anonymous>)
+
+=item * C<password> - The password to present to the pyzor server (default
+is empty)
+
+=item * C<timeout> - The maximum time, in seconds, to wait for a response
+from the pyzor server (defeault is 3.5)
+
+=back
+
+=item Output
+
+=over 3
+
+Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
+
+=back
+
+=back
+
+=cut
+
+sub new {
+    my ( $class, %OPTS ) = @_;
+
+    return bless {
+        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
+        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
+        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
+        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
+        '_op_spec'     => $DEFAULT_OP_SPEC,
+        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
+    }, $class;
+}
+
+#----------------------------------------------------------------------
+
+=head1 REQUEST METHODS
+
+=head2 report($digest)
+
+Report the digest of a spam message to the pyzor server. This function
+will throw if a messaging failure or timeout happens.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to report, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above.
+
+=back
+
+=back
+
+=cut
+
+sub report {
+    my ( $self, $digest ) = @_;
+
+    my $msg_ref = $self->_get_base_msg( 'report', $digest );
+
+    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
+
+    return $self->_send_receive_msg($msg_ref);
+}
+
+=head2 check($digest)
+
+Check the digest of a message to see if
+the pyzor server has a report for it. This function
+will throw if a messaging failure or timeout happens.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to check, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above
+as well as the following:
+
+=over
+
+=item * C<Count> - The number of reports the server has received
+for the given digest.
+
+=item * C<WL-Count> - The number of whitelist requests the server has received
+for the given digest.
+
+=back
+
+=back
+
+=back
+
+=cut
+
+sub check {
+    my ( $self, $digest ) = @_;
+
+    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
+}
+
+# ----------------------------------------
+
+sub _send_receive_msg {
+    my ( $self, $msg_ref ) = @_;
+
+    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
+
+    $self->_sign_msg($msg_ref);
+
+    return $self->_do_send_receive(
+        $self->_generate_packet_from_message($msg_ref) . "\n\n",
+        $thread_id,
+    );
+}
+
+sub _get_base_msg {
+    my ( $self, $op, $digest ) = @_;
+
+    die "Implementor error: op is required" if !$op;
+    die "error: digest is required"         if !$digest;
+
+    return {
+        'User'      => $self->{'_username'},
+        'PV'        => $PYZOR_PROTOCOL_VERSION,
+        'Time'      => time(),
+        'Op'        => $op,
+        'Op-Digest' => $digest,
+        'Thread'    => $self->_generate_thread_id()
+    };
+}
+
+sub _do_send_receive {
+    my ( $self, $packet, $thread_id ) = @_;
+
+    my $sock = $self->_get_connection_or_die();
+
+    $self->_send_packet( $sock, $packet );
+    my $response = $self->_receive_packet( $sock, $thread_id );
+
+    return 0 if not defined $response;
+
+    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
+
+    delete $resp_hr->{'Thread'};
+
+    my $response_pv = delete $resp_hr->{'PV'};
+
+    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
+        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
+    }
+
+    return $resp_hr;
+}
+
+sub _receive_packet {
+    my ( $self, $sock, $thread_id ) = @_;
+
+    my $timeout = $self->{'_timeout'} * 1000;
+
+    my $end_time = time + $self->{'_timeout'};
+
+    $sock->blocking(0);
+    my $response = '';
+    my $rout     = '';
+    my $rin      = '';
+    vec( $rin, fileno($sock), 1 ) = 1;
+
+    while (1) {
+        my $time_left = $end_time - time;
+
+        if ( $time_left <= 0 ) {
+          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
+          return;
+        }
+
+        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
+        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
+            warn "read from socket: $!";
+        }
+
+        if ( index( $response, "\n\n" ) > -1 ) {
+
+            # Reject the response unless its thread ID matches what we sent.
+            # This prevents confusion among concurrent Pyzor reqeusts.
+            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
+                last;
+            }
+            else {
+                $response = '';
+            }
+        }
+
+        my $found = select( $rout = $rin, undef, undef, $time_left );
+        warn "select(): $!" if $found == -1;
+    }
+
+    return $response;
+}
+
+sub _send_packet {
+    my ( $self, $sock, $packet ) = @_;
+
+    $sock->blocking(1);
+    syswrite( $sock, $packet ) or warn "write to socket: $!";
+
+    return;
+}
+
+sub _get_connection_or_die {
+    my ($self) = @_;
+
+    # clear the socket if the PID changes
+    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
+        undef $self->{'_sock_pid'};
+        undef $self->{'_sock'};
+    }
+
+    $self->{'_sock_pid'} ||= $$;
+    $self->{'_sock'}     ||= IO::Socket::INET->new(
+        'PeerHost' => $self->{'_server_host'},
+        'PeerPort' => $self->{'_server_port'},
+        'Proto'    => 'udp'
+    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
+
+    return $self->{'_sock'};
+}
+
+sub _sign_msg {
+    my ( $self, $msg_ref ) = @_;
+
+    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
+        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
+    );
+
+    return 1;
+}
+
+sub _generate_packet_from_message {
+    my ( $self, $msg_ref ) = @_;
+
+    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
+}
+
+sub _generate_thread_id {
+    my $RAND_MAX = 2**16;
+    my $val      = 0;
+    $val = int rand($RAND_MAX) while $val < 1024;
+    return $val;
+}
+
+sub _get_user_pass_hash_key {
+    my ($self) = @_;
+
+    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
new file mode 100644
index 0000000..0e8a5ae
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
@@ -0,0 +1,103 @@
+package Mail::SpamAssassin::Pyzor::Digest;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest
+
+=head1 SYNOPSIS
+
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
+
+=head1 DESCRIPTION
+
+A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME ();
+
+use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
+use Digest::SHA qw(sha1_hex);
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $hex = get( $MSG )
+
+This takes an email message in raw MIME text format (i.e., as saved in the
+standard mbox format) and returns the message’s Pyzor digest in lower-case
+hexadecimal.
+
+The output from this function should normally be identical to that of
+the C<pyzor> script’s C<digest> command. It is suitable for use in
+L<Mail::SpamAssassin::Pyzor::Client>’s request methods.
+
+=cut
+
+sub get {
+    my ($text) = @_;
+    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
+}
+
+# NB: This is called from the test.
+sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
+    my ($msg_text_sr) = @_;
+
+    my $parsed = Email::MIME->new($$msg_text_sr);
+
+    my @lines;
+
+    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
+
+    for my $payload (@$payloads_ar) {
+        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
+        for my $line (@p_lines) {
+            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
+
+            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
+
+            # Make sure we have an octet string.
+            utf8::encode($line) if utf8::is_utf8($line);
+
+            push @lines, $line;
+        }
+    }
+
+    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
+
+    return $digest_sr;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
new file mode 100644
index 0000000..522accd
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
@@ -0,0 +1,301 @@
+package Mail::SpamAssassin::Pyzor::Digest::Pieces;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::Pieces
+
+=head1 DESCRIPTION
+
+This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
+
+It reimplements logic found in pyzor’s F<digest.py> module
+(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME::ContentType ();
+use Encode                   ();
+
+our $VERSION = '0.03';
+
+# each tuple is [ offset, length ]
+use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
+
+use constant {
+    _MIN_LINE_LENGTH => 8,
+
+    _ATOMIC_NUM_LINES => 4,
+};
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a reference to an array of strings. Each string can be either
+a byte string or a character string (e.g., UTF-8 decoded).
+
+NB: RFC 2822 stipulates that message bodies should use CRLF
+line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
+will thus convert any plain CRs in a quoted-printable message
+body into CRLF. Python, though, doesn’t do this, so the output of
+our implementation of C<digest_payloads()> diverges from that of the Python
+original. It doesn’t ultimately make a difference since the line-ending
+whitespace gets trimmed regardless, but it’s necessary to factor in when
+comparing the output of our implementation with the Python output.
+
+=cut
+
+sub digest_payloads {
+    my ($parsed) = @_;
+
+    my @subparts = $parsed->subparts();
+
+    my @payloads;
+
+    if (@subparts) {
+        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
+    }
+    else {
+        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
+
+        my $payload;
+
+        if ( $main_type eq 'text' ) {
+
+            # Decode transfer encoding, but leave us as a byte string.
+            # Note that this is where Email::MIME converts plain LF to CRLF.
+            $payload = $parsed->body();
+
+            # This does the actual character decoding (i.e., “charset”).
+            $payload = Encode::decode( $encoding, $payload, $encode_check );
+
+            if ( $subtype eq 'html' ) {
+                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
+            }
+        }
+        else {
+
+            # This does no decoding, even of, e.g., quoted-printable or base64.
+            $payload = $parsed->body_raw();
+        }
+
+        push @payloads, $payload;
+    }
+
+    return \@payloads;
+}
+
+#----------------------------------------------------------------------
+
+=head2 normalize( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It modifies C<$STRING> in-place.
+
+As with the original implementation, if C<$STRING> contains (decoded)
+Unicode characters, those characters will be parsed accordingly. So:
+
+    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
+
+    normalize($str);
+
+The above will leave C<$str> alone, but this:
+
+    utf8::decode($str);
+
+    normalize($str);
+
+… will trim off the last two bytes from C<$str>.
+
+=cut
+
+sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
+
+    # NULs are bad, mm-kay?
+    $_[0] =~ tr<\0><>d;
+
+    # NB: Python’s \s without re.UNICODE is the same as Perl’s \s
+    # with the /a modifier.
+    #
+    # https://docs.python.org/2/library/re.html
+    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
+
+    # Python: re.compile(r'\S{10,}')
+    $_[0] =~ s<\S{10,}><>ag;
+
+    # Python: re.compile(r'\S+@\S+')
+    $_[0] =~ s<\S+ @ \S+><>agx;
+
+    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
+    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
+
+    # (from digest.py …)
+    # Make sure we do the whitespace last because some of the previous
+    # patterns rely on whitespace.
+    $_[0] =~ tr< \x09-\x0d><>d;
+
+    # This is fun. digest.py’s normalize() does a non-UNICODE whitespace
+    # strip, then calls strip() on the string, which *will* strip Unicode
+    # whitespace from the ends.
+    $_[0] =~ s<\A\s+><>;
+    $_[0] =~ s<\s+\z><>;
+
+    return;
+}
+
+#----------------------------------------------------------------------
+
+=head2 $yn = should_handle_line( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a boolean.
+
+=cut
+
+sub should_handle_line {
+    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
+}
+
+#----------------------------------------------------------------------
+
+=head2 $sr = assemble_lines( \@LINES )
+
+This assembles a string buffer out of @LINES. The string is the buffer
+of octets that will be hashed to produce the message digest.
+
+Each member of @LINES is expected to be an B<octet string>, not a
+character string.
+
+=cut
+
+sub assemble_lines {
+    my ($lines_ar) = @_;
+
+    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
+
+        # cf. handle_atomic() in digest.py
+        return \join( q<>, @$lines_ar );
+    }
+
+    #----------------------------------------------------------------------
+    # cf. handle_atomic() in digest.py
+
+    my $str = q<>;
+
+    for my $ofs_len ( _HASH_SPEC() ) {
+        my ( $offset, $length ) = @$ofs_len;
+
+        for my $i ( 0 .. ( $length - 1 ) ) {
+            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
+
+            next if !defined $lines_ar->[$idx];
+
+            $str .= $lines_ar->[$idx];
+        }
+    }
+
+    return \$str;
+}
+
+#----------------------------------------------------------------------
+
+=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
+
+=cut
+
+use constant _QUOTED_PRINTABLE_NAMES => (
+    "quopri-codec",
+    "quopri",
+    "quoted-printable",
+    "quotedprintable",
+);
+
+# Make Encode::decode() ignore anything that doesn’t fit the
+# given encoding.
+use constant _encode_check_ignore => q<>;
+
+sub parse_content_type {
+    my ($content_type) = @_;
+
+    $Email::MIME::ContentType::STRICT_PARAMS = 0;
+    my $ct_parse = Email::MIME::ContentType::parse_content_type(
+        $content_type,
+    );
+
+    my $main = $ct_parse->{'type'}    || q<>;
+    my $sub  = $ct_parse->{'subtype'} || q<>;
+
+    my $encoding = $ct_parse->{'attributes'}{'charset'};
+
+    my $checkval;
+
+    if ($encoding) {
+
+        # Lower-case everything, convert underscore to dash, and remove NUL.
+        $encoding =~ tr<A-Z_\0><a-z->d;
+
+        # Apparently pyzor accommodates messages that put the transfer
+        # encoding in the Content-Type.
+        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
+            $checkval = Encode::FB_CROAK();
+        }
+    }
+    else {
+        $encoding = 'ascii';
+    }
+
+    # Match Python .decode()’s 'ignore' behavior
+    $checkval ||= \&_encode_check_ignore;
+
+    return ( $main, $sub, $encoding, $checkval );
+}
+
+#----------------------------------------------------------------------
+
+=head2 @lines = splitlines( $TEXT )
+
+Imitates C<str.splitlines()>. (cf. C<pydoc str>)
+
+Returns a plain list in list context. Returns the number of
+items to be returned in scalar context.
+
+=cut
+
+sub splitlines {
+    return split m<\r\n?|\n>, $_[0];
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
new file mode 100644
index 0000000..2617b4a
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
@@ -0,0 +1,177 @@
+package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::StripHtml
+
+=head1 SYNOPSIS
+
+    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
+
+=head1 DESCRIPTION
+
+This module attempts to duplicate pyzor’s HTML-stripping logic.
+
+=head1 ACCURACY
+
+This library cannot achieve 100%, bug-for-bug parity with pyzor
+because to do so would require duplicating Python’s own HTML parsing
+library. Since that library’s output has changed over time, and those
+changes in turn affect pyzor, it’s literally impossible to arrive at
+a single, fully-compatible reimplementation.
+
+That said, all known divergences between pyzor and this library involve
+invalid HTML as input.
+
+Please open bug reports for any divergences you identify, particularly
+if the input is valid HTML.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use HTML::Parser ();
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $stripped = strip( $HTML )
+
+Give it some HTML, and it’ll give back the stripped text.
+
+In B<general>, the stripping consists of removing tags as well as
+C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
+removes HTML entities.
+
+This tries very hard to duplicate pyzor’s behavior with invalid HTML.
+
+=cut
+
+sub strip {
+    my ($html) = @_;
+
+    $html =~ s<\A\s+><>;
+    $html =~ s<\s+\z><>;
+
+    my $p = HTML::Parser->new( api_version => 3 );
+
+    my @pieces;
+
+    my $accumulate = 1;
+
+    $p->handler(
+        start => sub {
+            my ($tagname) = @_;
+
+            $accumulate = 0 if $tagname eq 'script';
+            $accumulate = 0 if $tagname eq 'style';
+
+            return;
+        },
+        'tagname',
+    );
+
+    $p->handler(
+        end => sub {
+            $accumulate = 1;
+            return;
+        }
+    );
+
+    $p->handler(
+        text => sub {
+            my ($copy) = @_;
+
+            return if !$accumulate;
+
+            # pyzor’s HTML parser discards HTML entities. On top of that,
+            # we need to match, as closely as possible, pyzor’s handling of
+            # invalid HTML entities … which is a function of Python’s
+            # standard HTML parsing library. This will probably never be
+            # fully compatible with the pyzor, but we can get it close.
+
+            # The original is:
+            #
+            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+            #
+            # The parsing loop then “backs up” one byte if the last
+            # character isn’t a “;”. We use a look-ahead assertion to
+            # mimic that behavior.
+            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
+
+            # The original is:
+            #
+            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+            #
+            # We again use a look-ahead assertion to mimic Python.
+            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
+
+            # Python’s HTMLParser aborts its parsing loop when it encounters
+            # an invalid numeric reference.
+            $copy =~ s<\&\#
+                (?:
+                    [^0-9xX]        # anything but the expected first char
+                    |
+                    [0-9]+[a-fA-F]  # hex within decimal
+                    |
+                    [xX][^0-9a-fA-F]
+                )
+                (.*)
+            ><
+                ( -1 == index($1, ';') ) ? q<> : '&#'
+            >exs;
+
+            # Python’s HTMLParser treats invalid entities as incomplete
+            $copy =~ s<(\&\#?)><$1 >gx;
+
+            $copy =~ s<\A\s+><>;
+            $copy =~ s<\s+\z><>;
+
+            push @pieces, \$copy if length $copy;
+        },
+        'text,tagname',
+    );
+
+    $p->parse($html);
+    $p->eof();
+
+    my $payload = join( q< >, map { $$_ } @pieces );
+
+    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
+    # plain spaces.
+    $payload =~ s<[^\S\x{a0}]+>< >g;
+
+    return $payload;
+}
+
+1;
diff --git a/t/pyzor.t b/t/pyzor.t
index 891f38d..e4ef83f 100755
--- a/t/pyzor.t
+++ b/t/pyzor.t
@@ -3,12 +3,9 @@
 use lib '.'; use lib 't';
 use SATest; sa_t_init("pyzor");
 
-use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
-
 use Test::More;
 plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
-plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
-plan tests => 8;
+plan tests => 5;
 
 diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
 
@@ -30,7 +27,7 @@ tstprefs ("
 sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
 ok_all_patterns();
 # Same with fork
-sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
+sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
 ok_all_patterns();
 
 #TESTING FOR HAM
@@ -44,7 +41,3 @@ ok_all_patterns();
 
 sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
 ok_all_patterns();
-# same with fork
-sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
-ok_all_patterns();
-

Re: new Pyzor implementation

Posted by Benny Pedersen <me...@junc.eu>.
On 2021-10-14 16:15, Giovanni Bechis wrote:
> Hi,
> cPanel has developed a native Perl Pyzor implementation for 
> SpamAssassin
> and a diff against SpamAssassin 4.0 follows.
> Atm I am using it in production on a small server, more tests and
> opinions are welcome.
> 
> Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.

is pyzord still supported in a compatible way ?, asking since i belive 
in pyzord localy could be performance booster to pyzor, but none have 
even created code changes so pyzord would ask other pyzord servers of 
match or even trustness

no go from me with it

Re: new Pyzor implementation

Posted by Benny Pedersen <me...@junc.eu>.
On 2021-10-17 05:06, Kevin A. McGrail wrote:
> No worries there that I know of.

+1

> cPanel has the paperwork for CCLA on file and several people with
> ICLA's as well.  They've given us permission to commit the code too.

good, we can hope cpanel will get gentoo portage, big hint

cpanel is a precompiled problem not seen on gentoo yet, if thay care 
thay listen

> I think it will be better than any dependency on external binaries.

like fuglu python is stable, pyzord and pyzor is aswell, but maybe not 
for perl forkers ? :/

is it time to get rid of shell external tools in spamassassin ?, if so 
good news that can delay

https://github.com/SpamExperts/OrangeAssassin sorry no rush, but i still 
hope for the better

Re: new Pyzor implementation

Posted by Giovanni Bechis <gi...@paclan.it>.
I removed IO::SigGuard dependency from my diff and I will work on deleting Email::MIME as well.
 Giovanni

On 10/17/21 16:58, Henrik K wrote:
> 
> Atleast these seem completely unneeded module dependencies.
> 
> IO::SigGuard (not even found in Ubuntu packages)
> Email::MIME
> 
> So the code should be refactored to use SA methods as necessary.
> 
> 
> On Sat, Oct 16, 2021 at 11:06:07PM -0400, Kevin A. McGrail wrote:
>> No worries there that I know of.
>>
>> cPanel has the paperwork for CCLA on file and several people with ICLA's as
>> well.  They've given us permission to commit the code too.
>>
>> I think it will be better than any dependency on external binaries.
>>
>> Regards,
>>
>> KAM
>>
>> On 10/14/2021 10:37 AM, Henrik K wrote:
>>> If that's the case, I probably wouldn't have any objections.  Not sure if it
>>> requires some Contributor License Agreement from cPanels part (maybe they
>>> already have one), and I guess atleast a bug to make it official..  Sidney
>>> or KAM can probably chime in on the admin side..
>>>
>>>
>>> On Thu, Oct 14, 2021 at 04:32:53PM +0200, Giovanni Bechis wrote:
>>>> Once committed, code will be no more developed by cPanel on CPAN
>>>> and original code will be removed.
>>>>
>>>> I can work to integrate old and new Pyzor versions.
>>>>
>>>>   Giovanni
>>>>
>>>> On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
>>>>> If it's developed by cPanel in CPAN, then it should not be committed to SA,
>>>>> unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
>>>>> we have developer resources and will to take it aboard.
>>>>>
>>>>> As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
>>>>> as it makes no sense to ditch support for the widely installed original
>>>>> Pyzor.
>>>>>
>>>>>
>>>>> On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
>>>>>> Hi,
>>>>>> cPanel has developed a native Perl Pyzor implementation for SpamAssassin
>>>>>> and a diff against SpamAssassin 4.0 follows.
>>>>>> Atm I am using it in production on a small server, more tests and
>>>>>> opinions are welcome.
>>>>>>
>>>>>> Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
>>>>>>
>>>>>>   Cheers
>>>>>>    Giovanni
>>>>>>
>>>>>> diff --git a/MANIFEST b/MANIFEST
>>>>>> index 25d0192..2d9588c 100644
>>>>>> --- a/MANIFEST
>>>>>> +++ b/MANIFEST
>>>>>> @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
>>>>>>   lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
>>>>>>   lib/Mail/SpamAssassin/PluginHandler.pm
>>>>>>   lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
>>>>>> +lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>>>> +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>>>> +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>>>> +lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>>>> +lib/Mail/SpamAssassin/Pyzor.pm
>>>>>>   lib/Mail/SpamAssassin/RegistryBoundaries.pm
>>>>>>   lib/Mail/SpamAssassin/Reporter.pm
>>>>>>   lib/Mail/SpamAssassin/SQLBasedAddrList.pm
>>>>>> diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>>>> index 3efd4b4..e4c9c05 100644
>>>>>> --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>>>> +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>>>> @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
>>>>>>   use Mail::SpamAssassin::Plugin;
>>>>>>   use Mail::SpamAssassin::Logger;
>>>>>> -use Mail::SpamAssassin::Timeout;
>>>>>> -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
>>>>>> -                                proc_status_ok exit_status_str);
>>>>>> +use Mail::SpamAssassin::Util qw(untaint_var);
>>>>>> +
>>>>>>   use strict;
>>>>>>   use warnings;
>>>>>>   # use bytes;
>>>>>>   use re 'taint';
>>>>>> -use Storable;
>>>>>> -use POSIX qw(PIPE_BUF WNOHANG _exit);
>>>>>> -
>>>>>>   our @ISA = qw(Mail::SpamAssassin::Plugin);
>>>>>>   sub new {
>>>>>> @@ -78,7 +74,7 @@ sub set_config {
>>>>>>     my ($self, $conf) = @_;
>>>>>>     my @cmds;
>>>>>> -=head1 USER OPTIONS
>>>>>> +=head1 ADMINISTRATOR OPTIONS
>>>>>>   =over 4
>>>>>> @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
>>>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
>>>>>>     });
>>>>>> -=item pyzor_fork (0|1)		(default: 0)
>>>>>> -
>>>>>> -Instead of running Pyzor synchronously, fork separate process for it and
>>>>>> -read the results in later (similar to async DNS lookups).  Increases
>>>>>> -throughput.  Experimental.
>>>>>> -
>>>>>> -=cut
>>>>>> -
>>>>>> -  push(@cmds, {
>>>>>> -    setting => 'pyzor_fork',
>>>>>> -    is_admin => 1,
>>>>>> -    default => 0,
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
>>>>>> -  });
>>>>>> -
>>>>>> -=item pyzor_count_min NUMBER	(default: 5)
>>>>>> +=item pyzor_count_min NUMBER		(default: 5)
>>>>>>   This option sets how often a message's body checksum must have been
>>>>>>   reported to the Pyzor server before SpamAssassin will consider the Pyzor
>>>>>> @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
>>>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>>>     });
>>>>>> -  # Deprecated setting, the name makes no sense!
>>>>>> -  push (@cmds, {
>>>>>> -    setting => 'pyzor_max',
>>>>>> -    is_admin => 1,
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
>>>>>> -    code => sub {
>>>>>> -      my ($self, $key, $value, $line) = @_;
>>>>>> -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
>>>>>> -      if ($value !~ /^\d+$/) {
>>>>>> -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>>>> -      }
>>>>>> -      $self->{pyzor_count_min} = $value;
>>>>>> -    }
>>>>>> -  });
>>>>>> -
>>>>>> -=item pyzor_whitelist_min NUMBER	(default: 10)
>>>>>> -
>>>>>> -This option sets how often a message's body checksum must have been
>>>>>> -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
>>>>>> -result.  Final decision is made by pyzor_whitelist_factor.
>>>>>> -
>>>>>> -=cut
>>>>>> -
>>>>>> -  push (@cmds, {
>>>>>> -    setting => 'pyzor_whitelist_min',
>>>>>> -    is_admin => 1,
>>>>>> -    default => 10,
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>>> -  });
>>>>>> -
>>>>>> -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
>>>>>> -
>>>>>> -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
>>>>>> -For default setting this means: 50 reports requires 10 whitelistings.
>>>>>> -
>>>>>> -=cut
>>>>>> -
>>>>>> -  push (@cmds, {
>>>>>> -    setting => 'pyzor_whitelist_factor',
>>>>>> -    is_admin => 1,
>>>>>> -    default => 0.2,
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>>> -  });
>>>>>> -
>>>>>>   =back
>>>>>> -=head1 ADMINISTRATOR OPTIONS
>>>>>> -
>>>>>>   =over 4
>>>>>>   =item pyzor_timeout n		(default: 5)
>>>>>> @@ -210,478 +145,182 @@ removing one of them.
>>>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
>>>>>>     });
>>>>>> -=item pyzor_options options
>>>>>> +=item pyzor_whitelist_min NUMBER        (default: 10)
>>>>>> -Specify additional options to the pyzor(1) command. Please note that only
>>>>>> -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
>>>>>> +This option sets how often a message's body checksum must have been
>>>>>> +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
>>>>>> +result.  Final decision is made by pyzor_whitelist_factor.
>>>>>>   =cut
>>>>>>     push (@cmds, {
>>>>>> -    setting => 'pyzor_options',
>>>>>> +    setting => 'pyzor_whitelist_min',
>>>>>>       is_admin => 1,
>>>>>> -    default => '',
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
>>>>>> -    code => sub {
>>>>>> -      my ($self, $key, $value, $line) = @_;
>>>>>> -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
>>>>>> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>>>> -      }
>>>>>> -      $self->{pyzor_options} = $1;
>>>>>> -    }
>>>>>> +    default => 10,
>>>>>> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>>>     });
>>>>>> -=item pyzor_path STRING
>>>>>> +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
>>>>>> -This option tells SpamAssassin specifically where to find the C<pyzor>
>>>>>> -client instead of relying on SpamAssassin to find it in the current
>>>>>> -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
>>>>>> -you should use this, as the current PATH will have been cleared.
>>>>>> +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
>>>>>> +For default setting this means: 50 reports requires 10 whitelistings.
>>>>>>   =cut
>>>>>>     push (@cmds, {
>>>>>> -    setting => 'pyzor_path',
>>>>>> +    setting => 'pyzor_whitelist_factor',
>>>>>>       is_admin => 1,
>>>>>> -    default => undef,
>>>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
>>>>>> -    code => sub {
>>>>>> -      my ($self, $key, $value, $line) = @_;
>>>>>> -      if (!defined $value || !length $value) {
>>>>>> -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
>>>>>> -      }
>>>>>> -      $value = untaint_file_path($value);
>>>>>> -      if (!-x $value) {
>>>>>> -	info("config: pyzor_path \"$value\" isn't an executable");
>>>>>> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>>>> -      }
>>>>>> -
>>>>>> -      $self->{pyzor_path} = $value;
>>>>>> -    }
>>>>>> +    default => 0.2,
>>>>>> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>>>     });
>>>>>>     $conf->{parser}->register_commands(\@cmds);
>>>>>>   }
>>>>>>   sub is_pyzor_available {
>>>>>> -  my ($self) = @_;
>>>>>> +    my ($self) = @_;
>>>>>> -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
>>>>>> -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
>>>>>> -
>>>>>> -  unless ($pyzor && -x $pyzor) {
>>>>>> -    dbg("pyzor: no pyzor executable found");
>>>>>> -    $self->{pyzor_available} = 0;
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -
>>>>>> -  # remember any found pyzor
>>>>>> -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
>>>>>> -
>>>>>> -  dbg("pyzor: pyzor is available: $pyzor");
>>>>>> -  return 1;
>>>>>> +    local $@;
>>>>>> +    eval {
>>>>>> +        require Mail::SpamAssassin::Pyzor::Digest;
>>>>>> +        require Mail::SpamAssassin::Pyzor::Client;
>>>>>> +    };
>>>>>> +    return $@ ? 0 : 1;
>>>>>>   }
>>>>>> -sub finish_parsing_start {
>>>>>> -  my ($self, $opts) = @_;
>>>>>> +sub get_pyzor_interface {
>>>>>> +  my ($self) = @_;
>>>>>> -  # If forking, hard adjust priority -100 to launch early
>>>>>> -  # Find rulenames from eval_to_rule mappings
>>>>>> -  if ($opts->{conf}->{pyzor_fork}) {
>>>>>> -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
>>>>>> -      dbg("pyzor: adjusting rule $_ priority to -100");
>>>>>> -      $opts->{conf}->{priority}->{$_} = -100;
>>>>>> -    }
>>>>>> +  if (!$self->{main}->{conf}->{use_pyzor}) {
>>>>>> +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
>>>>>> +    $self->{pyzor_interface} = "disabled";
>>>>>> +    $self->{pyzor_available} = 0;
>>>>>> +  }
>>>>>> +  elsif ($self->is_pyzor_available()) {
>>>>>> +    $self->{pyzor_interface} = "pyzor";
>>>>>> +    $self->{pyzor_available} = 1;
>>>>>> +  }
>>>>>> +  else {
>>>>>> +    dbg("pyzor: no pyzor found, disabling Pyzor");
>>>>>> +    $self->{pyzor_available} = 0;
>>>>>>     }
>>>>>>   }
>>>>>>   sub check_pyzor {
>>>>>> -  my ($self, $pms, $full) = @_;
>>>>>> -
>>>>>> -  return 0 if !$self->{pyzor_available};
>>>>>> -  return 0 if !$self->{main}->{conf}->{use_pyzor};
>>>>>> -
>>>>>> -  return 0 if $pms->{pyzor_running};
>>>>>> -  $pms->{pyzor_running} = 1;
>>>>>> -
>>>>>> -  return 0 if !$self->is_pyzor_available();
>>>>>> -
>>>>>> -  my $timer = $self->{main}->time_method("check_pyzor");
>>>>>> +  my ($self, $permsgstatus, $full) = @_;
>>>>>>     # initialize valid tags
>>>>>> -  $pms->{tag_data}->{PYZOR} = '';
>>>>>> -
>>>>>> -  # create fulltext tmpfile now (before possible forking)
>>>>>> -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
>>>>>> -
>>>>>> -  ## non-forking method
>>>>>> -
>>>>>> -  if (!$self->{main}->{conf}->{pyzor_fork}) {
>>>>>> -    my @results = $self->pyzor_lookup($pms);
>>>>>> -    return $self->_check_result($pms, \@results);
>>>>>> -  }
>>>>>> -
>>>>>> -  ## forking method
>>>>>> -
>>>>>> -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
>>>>>> -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
>>>>>> -
>>>>>> -  # create socketpair for communication
>>>>>> -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
>>>>>> -  my $back_selector = '';
>>>>>> -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
>>>>>> -  eval {
>>>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
>>>>>> -  } or do {
>>>>>> -    dbg("pyzor: backchannel pre-setup failed: $@");
>>>>>> -    delete $pms->{pyzor_backchannel};
>>>>>> -    return 0;
>>>>>> -  };
>>>>>> +  $permsgstatus->{tag_data}->{PYZOR} = "";
>>>>>> -  my $pid = fork();
>>>>>> -  if (!defined $pid) {
>>>>>> -    info("pyzor: child fork failed: $!");
>>>>>> -    delete $pms->{pyzor_backchannel};
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -  if (!$pid) {
>>>>>> -    $0 = "$0 (pyzor)";
>>>>>> -    $SIG{CHLD} = 'DEFAULT';
>>>>>> -    $SIG{PIPE} = 'IGNORE';
>>>>>> -    $SIG{$_} = sub {
>>>>>> -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
>>>>>> -      _exit(6);  # avoid END and destructor processing
>>>>>> -      kill('KILL',$$);  # still kicking? die!
>>>>>> -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
>>>>>> -    dbg("pyzor: child process $$ forked");
>>>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
>>>>>> -    my @results = $self->pyzor_lookup($pms);
>>>>>> -    my $backmsg;
>>>>>> -    eval {
>>>>>> -      $backmsg = Storable::freeze(\@results);
>>>>>> -    };
>>>>>> -    if ($@) {
>>>>>> -      dbg("pyzor: child return value freeze failed: $@");
>>>>>> -      _exit(0); # avoid END and destructor processing
>>>>>> -    }
>>>>>> -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
>>>>>> -      dbg("pyzor: child backchannel write failed: $!");
>>>>>> -    }
>>>>>> -    _exit(0); # avoid END and destructor processing
>>>>>> -  }
>>>>>> -
>>>>>> -  $pms->{pyzor_pid} = $pid;
>>>>>> +  my $timer = $self->{main}->time_method("check_pyzor");
>>>>>> -  eval {
>>>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
>>>>>> -  } or do {
>>>>>> -    dbg("pyzor: backchannel post-setup failed: $@");
>>>>>> -    delete $pms->{pyzor_backchannel};
>>>>>> -    return 0;
>>>>>> -  };
>>>>>> +  $self->get_pyzor_interface();
>>>>>> +  return 0 unless $self->{pyzor_available};
>>>>>> -  return 0;
>>>>>> +  return $self->pyzor_lookup($permsgstatus, $full);
>>>>>>   }
>>>>>>   sub pyzor_lookup {
>>>>>> -  my ($self, $pms) = @_;
>>>>>> -
>>>>>> -  my $conf = $self->{main}->{conf};
>>>>>> -  my $timeout = $conf->{pyzor_timeout};
>>>>>> -
>>>>>> -  # note: not really tainted, this came from system configuration file
>>>>>> -  my $path = untaint_file_path($conf->{pyzor_path});
>>>>>> -  my $opts = untaint_var($conf->{pyzor_options}) || '';
>>>>>> -
>>>>>> -  $pms->enter_helper_run_mode();
>>>>>> -
>>>>>> -  my $pid;
>>>>>> -  my @resp;
>>>>>> -  my $timer = Mail::SpamAssassin::Timeout->new(
>>>>>> -           { secs => $timeout, deadline => $pms->{master_deadline} });
>>>>>> -  my $err = $timer->run_and_catch(sub {
>>>>>> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
>>>>>> -
>>>>>> -    dbg("pyzor: opening pipe: ".
>>>>>> -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
>>>>>> -
>>>>>> -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
>>>>>> -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
>>>>>> -    $pid or die "$!\n";
>>>>>> -
>>>>>> -    # read+split avoids a Perl I/O bug (Bug 5985)
>>>>>> -    my($inbuf, $nread);
>>>>>> -    my $resp = '';
>>>>>> -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
>>>>>> -    defined $nread  or die "error reading from pipe: $!";
>>>>>> -    @resp = split(/^/m, $resp, -1);
>>>>>> -
>>>>>> -    my $errno = 0;
>>>>>> -    close PYZOR or $errno = $!;
>>>>>> -    if (proc_status_ok($?, $errno)) {
>>>>>> -      dbg("pyzor: [%s] finished successfully", $pid);
>>>>>> -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
>>>>>> -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
>>>>>> -    } else {
>>>>>> -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
>>>>>> -    }
>>>>>> -
>>>>>> -  });
>>>>>> -
>>>>>> -  if (defined(fileno(*PYZOR))) {  # still open
>>>>>> -    if ($pid) {
>>>>>> -      if (kill('TERM', $pid)) {
>>>>>> -        dbg("pyzor: killed stale helper [$pid]");
>>>>>> -      } else {
>>>>>> -        dbg("pyzor: killing helper application [$pid] failed: $!");
>>>>>> -      }
>>>>>> -    }
>>>>>> -    my $errno = 0;
>>>>>> -    close PYZOR or $errno = $!;
>>>>>> -    proc_status_ok($?, $errno)
>>>>>> -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
>>>>>> -  }
>>>>>> -
>>>>>> -  $pms->leave_helper_run_mode();
>>>>>> -
>>>>>> -  if ($timer->timed_out()) {
>>>>>> -    dbg("pyzor: check timed out after $timeout seconds");
>>>>>> -    return ();
>>>>>> -  } elsif ($err) {
>>>>>> -    chomp $err;
>>>>>> -    info("pyzor: check failed: $err");
>>>>>> -    return ();
>>>>>> -  }
>>>>>> -
>>>>>> -  return @resp;
>>>>>> -}
>>>>>> -
>>>>>> -sub check_tick {
>>>>>> -  my ($self, $opts) = @_;
>>>>>> -  $self->_check_forked_result($opts->{permsgstatus}, 0);
>>>>>> -}
>>>>>> -
>>>>>> -sub check_cleanup {
>>>>>> -  my ($self, $opts) = @_;
>>>>>> -  $self->_check_forked_result($opts->{permsgstatus}, 1);
>>>>>> -}
>>>>>> -
>>>>>> -sub _check_forked_result {
>>>>>> -  my ($self, $pms, $finish) = @_;
>>>>>> -
>>>>>> -  return 0 if !$pms->{pyzor_backchannel};
>>>>>> -  return 0 if !$pms->{pyzor_pid};
>>>>>> +    my ( $self, $permsgstatus, $fulltext ) = @_;
>>>>>> +    my $conf = $self->{main}->{conf};
>>>>>> +    my $timeout = $conf->{pyzor_timeout};
>>>>>> +
>>>>>> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
>>>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
>>>>>> +
>>>>>> +    local $@;
>>>>>> +    my $ref = eval { $client->check($digest); };
>>>>>> +    dbg("pyzor: got response: $client->{'_server_host'}");
>>>>>> +    # $client reply must be an hash
>>>>>> +    return 0 if (not (ref $ref eq ref {}));
>>>>>> +    if ($@) {
>>>>>> +        my $err = $@;
>>>>>> -  my $timer = $self->{main}->time_method("check_pyzor");
>>>>>> +        $err = eval { $err->get_message() } || $err;
>>>>>> -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
>>>>>> -
>>>>>> -  my $kid_pid = $pms->{pyzor_pid};
>>>>>> -  # if $finish, force waiting for the child
>>>>>> -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
>>>>>> -  if ($pid == 0) {
>>>>>> -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
>>>>>> -    if ($pms->{pyzor_abort}) {
>>>>>> -      dbg("pyzor: bailing out due to deadline/shortcircuit");
>>>>>> -      kill('TERM', $kid_pid);
>>>>>> -      if (waitpid($kid_pid, WNOHANG) == 0) {
>>>>>> -        sleep(1);
>>>>>> -        if (waitpid($kid_pid, WNOHANG) == 0) {
>>>>>> -          dbg("pyzor: child process $kid_pid still alive, KILL");
>>>>>> -          kill('KILL', $kid_pid);
>>>>>> -          waitpid($kid_pid, 0);
>>>>>> +        warn("pyzor: check failed: $err\n");
>>>>>> +        return 0;
>>>>>> +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
>>>>>> +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
>>>>>> +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
>>>>>> +        } else {
>>>>>> +          dbg("pyzor: check failed with undefined code");
>>>>>>           }
>>>>>> -      }
>>>>>> -      delete $pms->{pyzor_pid};
>>>>>> -      delete $pms->{pyzor_backchannel};
>>>>>> +        return 0;
>>>>>>       }
>>>>>> -    return 0;
>>>>>> -  } elsif ($pid == -1) {
>>>>>> -    # child does not exist?
>>>>>> -    dbg("pyzor: child process $kid_pid already handled?");
>>>>>> -    delete $pms->{pyzor_backchannel};
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
>>>>>> +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
>>>>>> +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
>>>>>> +    my $count_min = $conf->{pyzor_count_min};
>>>>>> +    my $wl_min = $conf->{pyzor_whitelist_min};
>>>>>> -  dbg("pyzor: child process $kid_pid finished, reading results");
>>>>>> +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
>>>>>> +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
>>>>>> -  my $backmsg;
>>>>>> -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
>>>>>> -  if (!defined $ret || $ret == 0) {
>>>>>> -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
>>>>>> -    delete $pms->{pyzor_backchannel};
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -
>>>>>> -  delete $pms->{pyzor_backchannel};
>>>>>> +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
>>>>>> -  my $results;
>>>>>> -  eval {
>>>>>> -    $results = Storable::thaw($backmsg);
>>>>>> -  };
>>>>>> -  if ($@) {
>>>>>> -    dbg("pyzor: child return value thaw failed: $@");
>>>>>> -    return;
>>>>>> -  }
>>>>>> -
>>>>>> -  $self->_check_result($pms, $results);
>>>>>> -}
>>>>>> +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
>>>>>> +      $wl_limit);
>>>>>> -sub _check_result {
>>>>>> -  my ($self, $pms, $results) = @_;
>>>>>> -
>>>>>> -  if (!@$results) {
>>>>>> -    dbg("pyzor: no response from server");
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -
>>>>>> -  my $count = 0;
>>>>>> -  my $count_wl = 0;
>>>>>> -  foreach my $res (@$results) {
>>>>>> -    chomp($res);
>>>>>> -    if ($res =~ /^Traceback/) {
>>>>>> -      info("pyzor: internal error, python traceback seen in response: $res");
>>>>>> +    # Empty body etc results in same hash, we should skip very large numbers..
>>>>>> +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
>>>>>> +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
>>>>>>         return 0;
>>>>>>       }
>>>>>> -    dbg("pyzor: got response: $res");
>>>>>> -    # this regexp is intended to be a little bit forgiving
>>>>>> -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
>>>>>> -      # until pyzor servers can sync their DBs,
>>>>>> -      # sum counts obtained from all servers
>>>>>> -      $count += untaint_var($1)+0; # crazy but needs untainting
>>>>>> -      $count_wl += untaint_var($2)+0;
>>>>>> -    } else {
>>>>>> -      # warn on failures to parse
>>>>>> -      info("pyzor: failure to parse response \"$res\"");
>>>>>> -    }
>>>>>> -  }
>>>>>> -
>>>>>> -  my $conf = $self->{main}->{conf};
>>>>>> -
>>>>>> -  my $count_min = $conf->{pyzor_count_min};
>>>>>> -  my $wl_min = $conf->{pyzor_whitelist_min};
>>>>>> -  my $wl_limit = $count_wl >= $wl_min ?
>>>>>> -    $count * $conf->{pyzor_whitelist_factor} : 0;
>>>>>> -
>>>>>> -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
>>>>>> -    $wl_limit);
>>>>>> -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
>>>>>> -
>>>>>> -  # Empty body etc results in same hash, we should skip very large numbers..
>>>>>> -  if ($count >= 1000000 || $count_wl >= 10000) {
>>>>>> -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -
>>>>>> -  # Whitelisted?
>>>>>> -  if ($wl_limit && $count_wl >= $wl_limit) {
>>>>>> -    dbg("pyzor: message whitelisted");
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> +    # Whitelisted?
>>>>>> +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
>>>>>> +      dbg("pyzor: message whitelisted");
>>>>>> +      return 0;
>>>>>> +    }
>>>>>> -  if ($count >= $count_min) {
>>>>>> -    if ($conf->{pyzor_fork}) {
>>>>>> -      # forked needs to run got_hit()
>>>>>> -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
>>>>>> +    if ( $pyzor_count >= $count_min ) {
>>>>>> +      return 1;
>>>>>>       }
>>>>>> -    return 1;
>>>>>> -  }
>>>>>> -  return 0;
>>>>>> +    return 0;
>>>>>>   }
>>>>>>   sub plugin_report {
>>>>>>     my ($self, $options) = @_;
>>>>>> -  return if !$self->{pyzor_available};
>>>>>> -  return if !$self->{main}->{conf}->{use_pyzor};
>>>>>> -  return if $options->{report}->{options}->{dont_report_to_pyzor};
>>>>>> -  return if !$self->is_pyzor_available();
>>>>>> -
>>>>>> -  # use temporary file: open2() is unreliable due to buffering under spamd
>>>>>> -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
>>>>>> -  if ($self->pyzor_report($options, $tmpf)) {
>>>>>> -    $options->{report}->{report_available} = 1;
>>>>>> -    info("reporter: spam reported to Pyzor");
>>>>>> -    $options->{report}->{report_return} = 1;
>>>>>> -  }
>>>>>> -  else {
>>>>>> -    info("reporter: could not report spam to Pyzor");
>>>>>> -  }
>>>>>> -  $options->{report}->delete_fulltext_tmpfile($tmpf);
>>>>>> +  return unless $self->{pyzor_available};
>>>>>> +  return unless $self->{main}->{conf}->{use_pyzor};
>>>>>> -  return 1;
>>>>>> +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
>>>>>> +  {
>>>>>> +    if ($self->pyzor_report($options)) {
>>>>>> +      $options->{report}->{report_available} = 1;
>>>>>> +      info("reporter: spam reported to Pyzor");
>>>>>> +      $options->{report}->{report_return} = 1;
>>>>>> +    }
>>>>>> +    else {
>>>>>> +      info("reporter: could not report spam to Pyzor");
>>>>>> +    }
>>>>>> +  }
>>>>>>   }
>>>>>>   sub pyzor_report {
>>>>>> -  my ($self, $options, $tmpf) = @_;
>>>>>> -
>>>>>> -  # note: not really tainted, this came from system configuration file
>>>>>> -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
>>>>>> -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
>>>>>> +    my ( $self, $options ) = @_;
>>>>>> -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
>>>>>> +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
>>>>>> -  $options->{report}->enter_helper_run_mode();
>>>>>> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
>>>>>> -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
>>>>>> -  my $err = $timer->run_and_catch(sub {
>>>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
>>>>>> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
>>>>>> -
>>>>>> -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
>>>>>> -
>>>>>> -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
>>>>>> -	$tmpf, 1, $path, split(' ', $opts), "report");
>>>>>> -    $pid or die "$!\n";
>>>>>> -
>>>>>> -    my($inbuf,$nread,$nread_all); $nread_all = 0;
>>>>>> -    # response is ignored, just check its existence
>>>>>> -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
>>>>>> -    defined $nread  or die "error reading from pipe: $!";
>>>>>> -
>>>>>> -    dbg("pyzor: empty response")  if $nread_all < 1;
>>>>>> -
>>>>>> -    my $errno = 0;  close PYZOR or $errno = $!;
>>>>>> -    # closing a pipe also waits for the process executing on the pipe to
>>>>>> -    # complete, no need to explicitly call waitpid
>>>>>> -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
>>>>>> -    if (proc_status_ok($?,$errno, 0)) {
>>>>>> -      dbg("pyzor: [%s] reporter finished successfully", $pid);
>>>>>> -    } else {
>>>>>> -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
>>>>>> +    local $@;
>>>>>> +    my $ref = eval { $client->report($digest); };
>>>>>> +    if ($@) {
>>>>>> +        warn("pyzor: report failed: $@");
>>>>>> +        return 0;
>>>>>>       }
>>>>>> -
>>>>>> -  });
>>>>>> -
>>>>>> -  $options->{report}->leave_helper_run_mode();
>>>>>> -
>>>>>> -  if ($timer->timed_out()) {
>>>>>> -    dbg("reporter: pyzor report timed out after $timeout seconds");
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -
>>>>>> -  if ($err) {
>>>>>> -    chomp $err;
>>>>>> -    if ($err eq '__brokenpipe__ignore__') {
>>>>>> -      dbg("reporter: pyzor report failed: broken pipe");
>>>>>> -    } else {
>>>>>> -      warn("reporter: pyzor report failed: $err\n");
>>>>>> +    elsif ( $ref->{'Code'} ne 200 ) {
>>>>>> +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
>>>>>> +        return 0;
>>>>>>       }
>>>>>> -    return 0;
>>>>>> -  }
>>>>>> -  return 1;
>>>>>> +    return 1;
>>>>>>   }
>>>>>> -# Version features
>>>>>> -sub has_fork { 1 }
>>>>>> -
>>>>>>   1;
>>>>>> -
>>>>>> -=back
>>>>>> -
>>>>>> -=cut
>>>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
>>>>>> new file mode 100644
>>>>>> index 0000000..8ac27f4
>>>>>> --- /dev/null
>>>>>> +++ b/lib/Mail/SpamAssassin/Pyzor.pm
>>>>>> @@ -0,0 +1,56 @@
>>>>>> +package Mail::SpamAssassin::Pyzor;
>>>>>> +
>>>>>> +# Copyright 2018 cPanel, LLC.
>>>>>> +# All rights reserved.
>>>>>> +# http://cpanel.net
>>>>>> +#
>>>>>> +# <@LICENSE>
>>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>>> +# this work for additional information regarding copyright ownership.
>>>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>>> +# the License.  You may obtain a copy of the License at:
>>>>>> +#
>>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>>> +#
>>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>>> +# See the License for the specific language governing permissions and
>>>>>> +# limitations under the License.
>>>>>> +# </...@LICENSE>
>>>>>> +#
>>>>>> +
>>>>>> +use strict;
>>>>>> +use warnings;
>>>>>> +
>>>>>> +our $VERSION = '0.06_01';
>>>>>> +
>>>>>> +=encoding utf-8
>>>>>> +
>>>>>> +=head1 NAME
>>>>>> +
>>>>>> +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
>>>>>> +
>>>>>> +=head1 DESCRIPTION
>>>>>> +
>>>>>> +This distribution contains Perl implementations of parts of
>>>>>> +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
>>>>>> +It is intended for use with L<Mail::SpamAssassin> but may be useful
>>>>>> +in other contexts.
>>>>>> +
>>>>>> +See the following modules for information on specific tools that
>>>>>> +the distribution includes:
>>>>>> +
>>>>>> +=over
>>>>>> +
>>>>>> +=item * L<Mail::SpamAssassin::Pyzor::Client>
>>>>>> +
>>>>>> +=item * L<Mail::SpamAssassin::Pyzor::Digest>
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +1;
>>>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>>>> new file mode 100644
>>>>>> index 0000000..ccff868
>>>>>> --- /dev/null
>>>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>>>> @@ -0,0 +1,415 @@
>>>>>> +package Mail::SpamAssassin::Pyzor::Client;
>>>>>> +
>>>>>> +# Copyright 2018 cPanel, LLC.
>>>>>> +# All rights reserved.
>>>>>> +# http://cpanel.net
>>>>>> +#
>>>>>> +# <@LICENSE>
>>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>>> +# this work for additional information regarding copyright ownership.
>>>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>>> +# the License.  You may obtain a copy of the License at:
>>>>>> +#
>>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>>> +#
>>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>>> +# See the License for the specific language governing permissions and
>>>>>> +# limitations under the License.
>>>>>> +# </...@LICENSE>
>>>>>> +#
>>>>>> +
>>>>>> +use strict;
>>>>>> +use warnings;
>>>>>> +
>>>>>> +=encoding utf-8
>>>>>> +
>>>>>> +=head1 NAME
>>>>>> +
>>>>>> +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
>>>>>> +
>>>>>> +=head1 SYNOPSIS
>>>>>> +
>>>>>> +    use Mail::SpamAssassin::Pyzor::Client ();
>>>>>> +    use Mail::SpamAssassin::Pyzor::Digest ();
>>>>>> +
>>>>>> +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
>>>>>> +
>>>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
>>>>>> +
>>>>>> +    my $check_ref = $client->check($digest);
>>>>>> +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
>>>>>> +
>>>>>> +    my $report_ref = $client->report($digest);
>>>>>> +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
>>>>>> +
>>>>>> +=head1 DESCRIPTION
>>>>>> +
>>>>>> +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
>>>>>> +implements the functionality needed for L<Mail::SpamAssassin>.
>>>>>> +
>>>>>> +=head1 PROTOCOL DETAILS
>>>>>> +
>>>>>> +The Pyzor protocol is not a published standard, and there appears to be
>>>>>> +no meaningful public documentation. What follows is enough information,
>>>>>> +largely gleaned through forum posts and reverse engineering, to facilitate
>>>>>> +effective use of this module:
>>>>>> +
>>>>>> +Pyzor is an RPC-oriented, message-based protocol. Each message
>>>>>> +is a simple dictionary of 7-bit ASCII keys and values. Server responses
>>>>>> +always include at least the following:
>>>>>> +
>>>>>> +=over
>>>>>> +
>>>>>> +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
>>>>>> +is an error.
>>>>>> +
>>>>>> +=item * C<Diag> - Similar to HTTP status reasons: a text description
>>>>>> +of the status.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +(NB: There are additional standard response headers that are useful only for
>>>>>> +the protocol itself and thus are not part of this module???s returns.)
>>>>>> +
>>>>>> +=head2 Reliability
>>>>>> +
>>>>>> +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
>>>>>> +destination. A transmission failure can happen in either the request or
>>>>>> +the response; in either case, a timeout error will result. Such errors
>>>>>> +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +our $VERSION = '0.04';
>>>>>> +
>>>>>> +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
>>>>>> +our $DEFAULT_SERVER_PORT    = 24441;
>>>>>> +our $DEFAULT_USERNAME       = 'anonymous';
>>>>>> +our $DEFAULT_PASSWORD       = '';
>>>>>> +our $DEFAULT_OP_SPEC        = '20,3,60,3';
>>>>>> +our $PYZOR_PROTOCOL_VERSION = 2.1;
>>>>>> +our $DEFAULT_TIMEOUT        = 3.5;
>>>>>> +our $READ_SIZE              = 8192;
>>>>>> +
>>>>>> +use IO::Socket::INET ();
>>>>>> +use Digest::SHA qw(sha1 sha1_hex);
>>>>>> +
>>>>>> +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head1 CONSTRUCTOR
>>>>>> +
>>>>>> +=head2 new(%OPTS)
>>>>>> +
>>>>>> +Create a new pyzor client.
>>>>>> +
>>>>>> +=over 2
>>>>>> +
>>>>>> +=item Input
>>>>>> +
>>>>>> +%OPTS are (all optional):
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +=item * C<server_host> - The pyzor server host to connect to (default is
>>>>>> +C<public.pyzor.org>)
>>>>>> +
>>>>>> +=item * C<server_port> - The pyzor server port to connect to (default is
>>>>>> +24441)
>>>>>> +
>>>>>> +=item * C<username> - The username to present to the pyzor server (default
>>>>>> +is C<anonymous>)
>>>>>> +
>>>>>> +=item * C<password> - The password to present to the pyzor server (default
>>>>>> +is empty)
>>>>>> +
>>>>>> +=item * C<timeout> - The maximum time, in seconds, to wait for a response
>>>>>> +from the pyzor server (defeault is 3.5)
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=item Output
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub new {
>>>>>> +    my ( $class, %OPTS ) = @_;
>>>>>> +
>>>>>> +    return bless {
>>>>>> +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
>>>>>> +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
>>>>>> +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
>>>>>> +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
>>>>>> +        '_op_spec'     => $DEFAULT_OP_SPEC,
>>>>>> +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
>>>>>> +    }, $class;
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head1 REQUEST METHODS
>>>>>> +
>>>>>> +=head2 report($digest)
>>>>>> +
>>>>>> +Report the digest of a spam message to the pyzor server. This function
>>>>>> +will throw if a messaging failure or timeout happens.
>>>>>> +
>>>>>> +=over 2
>>>>>> +
>>>>>> +=item Input
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +=item $digest C<SCALAR>
>>>>>> +
>>>>>> +The message digest to report, as given by
>>>>>> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=item Output
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +=item C<HASHREF>
>>>>>> +
>>>>>> +Returns a hashref of the standard attributes noted above.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub report {
>>>>>> +    my ( $self, $digest ) = @_;
>>>>>> +
>>>>>> +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
>>>>>> +
>>>>>> +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
>>>>>> +
>>>>>> +    return $self->_send_receive_msg($msg_ref);
>>>>>> +}
>>>>>> +
>>>>>> +=head2 check($digest)
>>>>>> +
>>>>>> +Check the digest of a message to see if
>>>>>> +the pyzor server has a report for it. This function
>>>>>> +will throw if a messaging failure or timeout happens.
>>>>>> +
>>>>>> +=over 2
>>>>>> +
>>>>>> +=item Input
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +=item $digest C<SCALAR>
>>>>>> +
>>>>>> +The message digest to check, as given by
>>>>>> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=item Output
>>>>>> +
>>>>>> +=over 3
>>>>>> +
>>>>>> +=item C<HASHREF>
>>>>>> +
>>>>>> +Returns a hashref of the standard attributes noted above
>>>>>> +as well as the following:
>>>>>> +
>>>>>> +=over
>>>>>> +
>>>>>> +=item * C<Count> - The number of reports the server has received
>>>>>> +for the given digest.
>>>>>> +
>>>>>> +=item * C<WL-Count> - The number of whitelist requests the server has received
>>>>>> +for the given digest.
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=back
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub check {
>>>>>> +    my ( $self, $digest ) = @_;
>>>>>> +
>>>>>> +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
>>>>>> +}
>>>>>> +
>>>>>> +# ----------------------------------------
>>>>>> +
>>>>>> +sub _send_receive_msg {
>>>>>> +    my ( $self, $msg_ref ) = @_;
>>>>>> +
>>>>>> +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
>>>>>> +
>>>>>> +    $self->_sign_msg($msg_ref);
>>>>>> +
>>>>>> +    return $self->_do_send_receive(
>>>>>> +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
>>>>>> +        $thread_id,
>>>>>> +    );
>>>>>> +}
>>>>>> +
>>>>>> +sub _get_base_msg {
>>>>>> +    my ( $self, $op, $digest ) = @_;
>>>>>> +
>>>>>> +    die "Implementor error: op is required" if !$op;
>>>>>> +    die "error: digest is required"         if !$digest;
>>>>>> +
>>>>>> +    return {
>>>>>> +        'User'      => $self->{'_username'},
>>>>>> +        'PV'        => $PYZOR_PROTOCOL_VERSION,
>>>>>> +        'Time'      => time(),
>>>>>> +        'Op'        => $op,
>>>>>> +        'Op-Digest' => $digest,
>>>>>> +        'Thread'    => $self->_generate_thread_id()
>>>>>> +    };
>>>>>> +}
>>>>>> +
>>>>>> +sub _do_send_receive {
>>>>>> +    my ( $self, $packet, $thread_id ) = @_;
>>>>>> +
>>>>>> +    my $sock = $self->_get_connection_or_die();
>>>>>> +
>>>>>> +    $self->_send_packet( $sock, $packet );
>>>>>> +    my $response = $self->_receive_packet( $sock, $thread_id );
>>>>>> +
>>>>>> +    return 0 if not defined $response;
>>>>>> +
>>>>>> +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
>>>>>> +
>>>>>> +    delete $resp_hr->{'Thread'};
>>>>>> +
>>>>>> +    my $response_pv = delete $resp_hr->{'PV'};
>>>>>> +
>>>>>> +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
>>>>>> +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
>>>>>> +    }
>>>>>> +
>>>>>> +    return $resp_hr;
>>>>>> +}
>>>>>> +
>>>>>> +sub _receive_packet {
>>>>>> +    my ( $self, $sock, $thread_id ) = @_;
>>>>>> +
>>>>>> +    my $timeout = $self->{'_timeout'} * 1000;
>>>>>> +
>>>>>> +    my $end_time = time + $self->{'_timeout'};
>>>>>> +
>>>>>> +    $sock->blocking(0);
>>>>>> +    my $response = '';
>>>>>> +    my $rout     = '';
>>>>>> +    my $rin      = '';
>>>>>> +    vec( $rin, fileno($sock), 1 ) = 1;
>>>>>> +
>>>>>> +    while (1) {
>>>>>> +        my $time_left = $end_time - time;
>>>>>> +
>>>>>> +        if ( $time_left <= 0 ) {
>>>>>> +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
>>>>>> +          return;
>>>>>> +        }
>>>>>> +
>>>>>> +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
>>>>>> +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
>>>>>> +            warn "read from socket: $!";
>>>>>> +        }
>>>>>> +
>>>>>> +        if ( index( $response, "\n\n" ) > -1 ) {
>>>>>> +
>>>>>> +            # Reject the response unless its thread ID matches what we sent.
>>>>>> +            # This prevents confusion among concurrent Pyzor reqeusts.
>>>>>> +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
>>>>>> +                last;
>>>>>> +            }
>>>>>> +            else {
>>>>>> +                $response = '';
>>>>>> +            }
>>>>>> +        }
>>>>>> +
>>>>>> +        my $found = select( $rout = $rin, undef, undef, $time_left );
>>>>>> +        warn "select(): $!" if $found == -1;
>>>>>> +    }
>>>>>> +
>>>>>> +    return $response;
>>>>>> +}
>>>>>> +
>>>>>> +sub _send_packet {
>>>>>> +    my ( $self, $sock, $packet ) = @_;
>>>>>> +
>>>>>> +    $sock->blocking(1);
>>>>>> +    syswrite( $sock, $packet ) or warn "write to socket: $!";
>>>>>> +
>>>>>> +    return;
>>>>>> +}
>>>>>> +
>>>>>> +sub _get_connection_or_die {
>>>>>> +    my ($self) = @_;
>>>>>> +
>>>>>> +    # clear the socket if the PID changes
>>>>>> +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
>>>>>> +        undef $self->{'_sock_pid'};
>>>>>> +        undef $self->{'_sock'};
>>>>>> +    }
>>>>>> +
>>>>>> +    $self->{'_sock_pid'} ||= $$;
>>>>>> +    $self->{'_sock'}     ||= IO::Socket::INET->new(
>>>>>> +        'PeerHost' => $self->{'_server_host'},
>>>>>> +        'PeerPort' => $self->{'_server_port'},
>>>>>> +        'Proto'    => 'udp'
>>>>>> +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
>>>>>> +
>>>>>> +    return $self->{'_sock'};
>>>>>> +}
>>>>>> +
>>>>>> +sub _sign_msg {
>>>>>> +    my ( $self, $msg_ref ) = @_;
>>>>>> +
>>>>>> +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
>>>>>> +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
>>>>>> +    );
>>>>>> +
>>>>>> +    return 1;
>>>>>> +}
>>>>>> +
>>>>>> +sub _generate_packet_from_message {
>>>>>> +    my ( $self, $msg_ref ) = @_;
>>>>>> +
>>>>>> +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
>>>>>> +}
>>>>>> +
>>>>>> +sub _generate_thread_id {
>>>>>> +    my $RAND_MAX = 2**16;
>>>>>> +    my $val      = 0;
>>>>>> +    $val = int rand($RAND_MAX) while $val < 1024;
>>>>>> +    return $val;
>>>>>> +}
>>>>>> +
>>>>>> +sub _get_user_pass_hash_key {
>>>>>> +    my ($self) = @_;
>>>>>> +
>>>>>> +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
>>>>>> +}
>>>>>> +
>>>>>> +1;
>>>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>>>> new file mode 100644
>>>>>> index 0000000..0e8a5ae
>>>>>> --- /dev/null
>>>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>>>> @@ -0,0 +1,103 @@
>>>>>> +package Mail::SpamAssassin::Pyzor::Digest;
>>>>>> +
>>>>>> +# Copyright 2018 cPanel, LLC.
>>>>>> +# All rights reserved.
>>>>>> +# http://cpanel.net
>>>>>> +#
>>>>>> +# <@LICENSE>
>>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>>> +# this work for additional information regarding copyright ownership.
>>>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>>> +# the License.  You may obtain a copy of the License at:
>>>>>> +#
>>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>>> +#
>>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>>> +# See the License for the specific language governing permissions and
>>>>>> +# limitations under the License.
>>>>>> +# </...@LICENSE>
>>>>>> +#
>>>>>> +
>>>>>> +use strict;
>>>>>> +use warnings;
>>>>>> +
>>>>>> +=encoding utf-8
>>>>>> +
>>>>>> +=head1 NAME
>>>>>> +
>>>>>> +Mail::SpamAssassin::Pyzor::Digest
>>>>>> +
>>>>>> +=head1 SYNOPSIS
>>>>>> +
>>>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
>>>>>> +
>>>>>> +=head1 DESCRIPTION
>>>>>> +
>>>>>> +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +use Email::MIME ();
>>>>>> +
>>>>>> +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
>>>>>> +use Digest::SHA qw(sha1_hex);
>>>>>> +
>>>>>> +our $VERSION = '0.03';
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head1 FUNCTIONS
>>>>>> +
>>>>>> +=head2 $hex = get( $MSG )
>>>>>> +
>>>>>> +This takes an email message in raw MIME text format (i.e., as saved in the
>>>>>> +standard mbox format) and returns the message???s Pyzor digest in lower-case
>>>>>> +hexadecimal.
>>>>>> +
>>>>>> +The output from this function should normally be identical to that of
>>>>>> +the C<pyzor> script???s C<digest> command. It is suitable for use in
>>>>>> +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub get {
>>>>>> +    my ($text) = @_;
>>>>>> +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
>>>>>> +}
>>>>>> +
>>>>>> +# NB: This is called from the test.
>>>>>> +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
>>>>>> +    my ($msg_text_sr) = @_;
>>>>>> +
>>>>>> +    my $parsed = Email::MIME->new($$msg_text_sr);
>>>>>> +
>>>>>> +    my @lines;
>>>>>> +
>>>>>> +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
>>>>>> +
>>>>>> +    for my $payload (@$payloads_ar) {
>>>>>> +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
>>>>>> +        for my $line (@p_lines) {
>>>>>> +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
>>>>>> +
>>>>>> +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
>>>>>> +
>>>>>> +            # Make sure we have an octet string.
>>>>>> +            utf8::encode($line) if utf8::is_utf8($line);
>>>>>> +
>>>>>> +            push @lines, $line;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
>>>>>> +
>>>>>> +    return $digest_sr;
>>>>>> +}
>>>>>> +
>>>>>> +1;
>>>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>>>> new file mode 100644
>>>>>> index 0000000..522accd
>>>>>> --- /dev/null
>>>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>>>> @@ -0,0 +1,301 @@
>>>>>> +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
>>>>>> +
>>>>>> +# Copyright 2018 cPanel, LLC.
>>>>>> +# All rights reserved.
>>>>>> +# http://cpanel.net
>>>>>> +#
>>>>>> +# <@LICENSE>
>>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>>> +# this work for additional information regarding copyright ownership.
>>>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>>> +# the License.  You may obtain a copy of the License at:
>>>>>> +#
>>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>>> +#
>>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>>> +# See the License for the specific language governing permissions and
>>>>>> +# limitations under the License.
>>>>>> +# </...@LICENSE>
>>>>>> +#
>>>>>> +
>>>>>> +use strict;
>>>>>> +use warnings;
>>>>>> +
>>>>>> +=encoding utf-8
>>>>>> +
>>>>>> +=head1 NAME
>>>>>> +
>>>>>> +Mail::SpamAssassin::Pyzor::Digest::Pieces
>>>>>> +
>>>>>> +=head1 DESCRIPTION
>>>>>> +
>>>>>> +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
>>>>>> +
>>>>>> +It reimplements logic found in pyzor???s F<digest.py> module
>>>>>> +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +use Email::MIME::ContentType ();
>>>>>> +use Encode                   ();
>>>>>> +
>>>>>> +our $VERSION = '0.03';
>>>>>> +
>>>>>> +# each tuple is [ offset, length ]
>>>>>> +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
>>>>>> +
>>>>>> +use constant {
>>>>>> +    _MIN_LINE_LENGTH => 8,
>>>>>> +
>>>>>> +    _ATOMIC_NUM_LINES => 4,
>>>>>> +};
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head1 FUNCTIONS
>>>>>> +
>>>>>> +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
>>>>>> +
>>>>>> +This imitates the corresponding object method in F<digest.py>.
>>>>>> +It returns a reference to an array of strings. Each string can be either
>>>>>> +a byte string or a character string (e.g., UTF-8 decoded).
>>>>>> +
>>>>>> +NB: RFC 2822 stipulates that message bodies should use CRLF
>>>>>> +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
>>>>>> +will thus convert any plain CRs in a quoted-printable message
>>>>>> +body into CRLF. Python, though, doesn???t do this, so the output of
>>>>>> +our implementation of C<digest_payloads()> diverges from that of the Python
>>>>>> +original. It doesn???t ultimately make a difference since the line-ending
>>>>>> +whitespace gets trimmed regardless, but it???s necessary to factor in when
>>>>>> +comparing the output of our implementation with the Python output.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub digest_payloads {
>>>>>> +    my ($parsed) = @_;
>>>>>> +
>>>>>> +    my @subparts = $parsed->subparts();
>>>>>> +
>>>>>> +    my @payloads;
>>>>>> +
>>>>>> +    if (@subparts) {
>>>>>> +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
>>>>>> +    }
>>>>>> +    else {
>>>>>> +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
>>>>>> +
>>>>>> +        my $payload;
>>>>>> +
>>>>>> +        if ( $main_type eq 'text' ) {
>>>>>> +
>>>>>> +            # Decode transfer encoding, but leave us as a byte string.
>>>>>> +            # Note that this is where Email::MIME converts plain LF to CRLF.
>>>>>> +            $payload = $parsed->body();
>>>>>> +
>>>>>> +            # This does the actual character decoding (i.e., ???charset???).
>>>>>> +            $payload = Encode::decode( $encoding, $payload, $encode_check );
>>>>>> +
>>>>>> +            if ( $subtype eq 'html' ) {
>>>>>> +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
>>>>>> +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
>>>>>> +            }
>>>>>> +        }
>>>>>> +        else {
>>>>>> +
>>>>>> +            # This does no decoding, even of, e.g., quoted-printable or base64.
>>>>>> +            $payload = $parsed->body_raw();
>>>>>> +        }
>>>>>> +
>>>>>> +        push @payloads, $payload;
>>>>>> +    }
>>>>>> +
>>>>>> +    return \@payloads;
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head2 normalize( $STRING )
>>>>>> +
>>>>>> +This imitates the corresponding object method in F<digest.py>.
>>>>>> +It modifies C<$STRING> in-place.
>>>>>> +
>>>>>> +As with the original implementation, if C<$STRING> contains (decoded)
>>>>>> +Unicode characters, those characters will be parsed accordingly. So:
>>>>>> +
>>>>>> +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
>>>>>> +
>>>>>> +    normalize($str);
>>>>>> +
>>>>>> +The above will leave C<$str> alone, but this:
>>>>>> +
>>>>>> +    utf8::decode($str);
>>>>>> +
>>>>>> +    normalize($str);
>>>>>> +
>>>>>> +??? will trim off the last two bytes from C<$str>.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
>>>>>> +
>>>>>> +    # NULs are bad, mm-kay?
>>>>>> +    $_[0] =~ tr<\0><>d;
>>>>>> +
>>>>>> +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
>>>>>> +    # with the /a modifier.
>>>>>> +    #
>>>>>> +    # https://docs.python.org/2/library/re.html
>>>>>> +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
>>>>>> +
>>>>>> +    # Python: re.compile(r'\S{10,}')
>>>>>> +    $_[0] =~ s<\S{10,}><>ag;
>>>>>> +
>>>>>> +    # Python: re.compile(r'\S+@\S+')
>>>>>> +    $_[0] =~ s<\S+ @ \S+><>agx;
>>>>>> +
>>>>>> +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
>>>>>> +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
>>>>>> +
>>>>>> +    # (from digest.py ???)
>>>>>> +    # Make sure we do the whitespace last because some of the previous
>>>>>> +    # patterns rely on whitespace.
>>>>>> +    $_[0] =~ tr< \x09-\x0d><>d;
>>>>>> +
>>>>>> +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
>>>>>> +    # strip, then calls strip() on the string, which *will* strip Unicode
>>>>>> +    # whitespace from the ends.
>>>>>> +    $_[0] =~ s<\A\s+><>;
>>>>>> +    $_[0] =~ s<\s+\z><>;
>>>>>> +
>>>>>> +    return;
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head2 $yn = should_handle_line( $STRING )
>>>>>> +
>>>>>> +This imitates the corresponding object method in F<digest.py>.
>>>>>> +It returns a boolean.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub should_handle_line {
>>>>>> +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head2 $sr = assemble_lines( \@LINES )
>>>>>> +
>>>>>> +This assembles a string buffer out of @LINES. The string is the buffer
>>>>>> +of octets that will be hashed to produce the message digest.
>>>>>> +
>>>>>> +Each member of @LINES is expected to be an B<octet string>, not a
>>>>>> +character string.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub assemble_lines {
>>>>>> +    my ($lines_ar) = @_;
>>>>>> +
>>>>>> +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
>>>>>> +
>>>>>> +        # cf. handle_atomic() in digest.py
>>>>>> +        return \join( q<>, @$lines_ar );
>>>>>> +    }
>>>>>> +
>>>>>> +    #----------------------------------------------------------------------
>>>>>> +    # cf. handle_atomic() in digest.py
>>>>>> +
>>>>>> +    my $str = q<>;
>>>>>> +
>>>>>> +    for my $ofs_len ( _HASH_SPEC() ) {
>>>>>> +        my ( $offset, $length ) = @$ofs_len;
>>>>>> +
>>>>>> +        for my $i ( 0 .. ( $length - 1 ) ) {
>>>>>> +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
>>>>>> +
>>>>>> +            next if !defined $lines_ar->[$idx];
>>>>>> +
>>>>>> +            $str .= $lines_ar->[$idx];
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    return \$str;
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +use constant _QUOTED_PRINTABLE_NAMES => (
>>>>>> +    "quopri-codec",
>>>>>> +    "quopri",
>>>>>> +    "quoted-printable",
>>>>>> +    "quotedprintable",
>>>>>> +);
>>>>>> +
>>>>>> +# Make Encode::decode() ignore anything that doesn???t fit the
>>>>>> +# given encoding.
>>>>>> +use constant _encode_check_ignore => q<>;
>>>>>> +
>>>>>> +sub parse_content_type {
>>>>>> +    my ($content_type) = @_;
>>>>>> +
>>>>>> +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
>>>>>> +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
>>>>>> +        $content_type,
>>>>>> +    );
>>>>>> +
>>>>>> +    my $main = $ct_parse->{'type'}    || q<>;
>>>>>> +    my $sub  = $ct_parse->{'subtype'} || q<>;
>>>>>> +
>>>>>> +    my $encoding = $ct_parse->{'attributes'}{'charset'};
>>>>>> +
>>>>>> +    my $checkval;
>>>>>> +
>>>>>> +    if ($encoding) {
>>>>>> +
>>>>>> +        # Lower-case everything, convert underscore to dash, and remove NUL.
>>>>>> +        $encoding =~ tr<A-Z_\0><a-z->d;
>>>>>> +
>>>>>> +        # Apparently pyzor accommodates messages that put the transfer
>>>>>> +        # encoding in the Content-Type.
>>>>>> +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
>>>>>> +            $checkval = Encode::FB_CROAK();
>>>>>> +        }
>>>>>> +    }
>>>>>> +    else {
>>>>>> +        $encoding = 'ascii';
>>>>>> +    }
>>>>>> +
>>>>>> +    # Match Python .decode()???s 'ignore' behavior
>>>>>> +    $checkval ||= \&_encode_check_ignore;
>>>>>> +
>>>>>> +    return ( $main, $sub, $encoding, $checkval );
>>>>>> +}
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head2 @lines = splitlines( $TEXT )
>>>>>> +
>>>>>> +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
>>>>>> +
>>>>>> +Returns a plain list in list context. Returns the number of
>>>>>> +items to be returned in scalar context.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub splitlines {
>>>>>> +    return split m<\r\n?|\n>, $_[0];
>>>>>> +}
>>>>>> +
>>>>>> +1;
>>>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>>>> new file mode 100644
>>>>>> index 0000000..2617b4a
>>>>>> --- /dev/null
>>>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>>>> @@ -0,0 +1,177 @@
>>>>>> +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
>>>>>> +
>>>>>> +# Copyright 2018 cPanel, LLC.
>>>>>> +# All rights reserved.
>>>>>> +# http://cpanel.net
>>>>>> +#
>>>>>> +# <@LICENSE>
>>>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>>>> +# this work for additional information regarding copyright ownership.
>>>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>>>> +# (the "License"); you may not use this file except in compliance with
>>>>>> +# the License.  You may obtain a copy of the License at:
>>>>>> +#
>>>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>>>> +#
>>>>>> +# Unless required by applicable law or agreed to in writing, software
>>>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>>>> +# See the License for the specific language governing permissions and
>>>>>> +# limitations under the License.
>>>>>> +# </...@LICENSE>
>>>>>> +#
>>>>>> +
>>>>>> +use strict;
>>>>>> +use warnings;
>>>>>> +
>>>>>> +=encoding utf-8
>>>>>> +
>>>>>> +=head1 NAME
>>>>>> +
>>>>>> +Mail::SpamAssassin::Pyzor::Digest::StripHtml
>>>>>> +
>>>>>> +=head1 SYNOPSIS
>>>>>> +
>>>>>> +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
>>>>>> +
>>>>>> +=head1 DESCRIPTION
>>>>>> +
>>>>>> +This module attempts to duplicate pyzor???s HTML-stripping logic.
>>>>>> +
>>>>>> +=head1 ACCURACY
>>>>>> +
>>>>>> +This library cannot achieve 100%, bug-for-bug parity with pyzor
>>>>>> +because to do so would require duplicating Python???s own HTML parsing
>>>>>> +library. Since that library???s output has changed over time, and those
>>>>>> +changes in turn affect pyzor, it???s literally impossible to arrive at
>>>>>> +a single, fully-compatible reimplementation.
>>>>>> +
>>>>>> +That said, all known divergences between pyzor and this library involve
>>>>>> +invalid HTML as input.
>>>>>> +
>>>>>> +Please open bug reports for any divergences you identify, particularly
>>>>>> +if the input is valid HTML.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +use HTML::Parser ();
>>>>>> +
>>>>>> +our $VERSION = '0.03';
>>>>>> +
>>>>>> +#----------------------------------------------------------------------
>>>>>> +
>>>>>> +=head1 FUNCTIONS
>>>>>> +
>>>>>> +=head2 $stripped = strip( $HTML )
>>>>>> +
>>>>>> +Give it some HTML, and it???ll give back the stripped text.
>>>>>> +
>>>>>> +In B<general>, the stripping consists of removing tags as well as
>>>>>> +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
>>>>>> +removes HTML entities.
>>>>>> +
>>>>>> +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
>>>>>> +
>>>>>> +=cut
>>>>>> +
>>>>>> +sub strip {
>>>>>> +    my ($html) = @_;
>>>>>> +
>>>>>> +    $html =~ s<\A\s+><>;
>>>>>> +    $html =~ s<\s+\z><>;
>>>>>> +
>>>>>> +    my $p = HTML::Parser->new( api_version => 3 );
>>>>>> +
>>>>>> +    my @pieces;
>>>>>> +
>>>>>> +    my $accumulate = 1;
>>>>>> +
>>>>>> +    $p->handler(
>>>>>> +        start => sub {
>>>>>> +            my ($tagname) = @_;
>>>>>> +
>>>>>> +            $accumulate = 0 if $tagname eq 'script';
>>>>>> +            $accumulate = 0 if $tagname eq 'style';
>>>>>> +
>>>>>> +            return;
>>>>>> +        },
>>>>>> +        'tagname',
>>>>>> +    );
>>>>>> +
>>>>>> +    $p->handler(
>>>>>> +        end => sub {
>>>>>> +            $accumulate = 1;
>>>>>> +            return;
>>>>>> +        }
>>>>>> +    );
>>>>>> +
>>>>>> +    $p->handler(
>>>>>> +        text => sub {
>>>>>> +            my ($copy) = @_;
>>>>>> +
>>>>>> +            return if !$accumulate;
>>>>>> +
>>>>>> +            # pyzor???s HTML parser discards HTML entities. On top of that,
>>>>>> +            # we need to match, as closely as possible, pyzor???s handling of
>>>>>> +            # invalid HTML entities ??? which is a function of Python???s
>>>>>> +            # standard HTML parsing library. This will probably never be
>>>>>> +            # fully compatible with the pyzor, but we can get it close.
>>>>>> +
>>>>>> +            # The original is:
>>>>>> +            #
>>>>>> +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
>>>>>> +            #
>>>>>> +            # The parsing loop then ???backs up??? one byte if the last
>>>>>> +            # character isn???t a ???;???. We use a look-ahead assertion to
>>>>>> +            # mimic that behavior.
>>>>>> +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
>>>>>> +
>>>>>> +            # The original is:
>>>>>> +            #
>>>>>> +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
>>>>>> +            #
>>>>>> +            # We again use a look-ahead assertion to mimic Python.
>>>>>> +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
>>>>>> +
>>>>>> +            # Python???s HTMLParser aborts its parsing loop when it encounters
>>>>>> +            # an invalid numeric reference.
>>>>>> +            $copy =~ s<\&\#
>>>>>> +                (?:
>>>>>> +                    [^0-9xX]        # anything but the expected first char
>>>>>> +                    |
>>>>>> +                    [0-9]+[a-fA-F]  # hex within decimal
>>>>>> +                    |
>>>>>> +                    [xX][^0-9a-fA-F]
>>>>>> +                )
>>>>>> +                (.*)
>>>>>> +            ><
>>>>>> +                ( -1 == index($1, ';') ) ? q<> : '&#'
>>>>>> +            >exs;
>>>>>> +
>>>>>> +            # Python???s HTMLParser treats invalid entities as incomplete
>>>>>> +            $copy =~ s<(\&\#?)><$1 >gx;
>>>>>> +
>>>>>> +            $copy =~ s<\A\s+><>;
>>>>>> +            $copy =~ s<\s+\z><>;
>>>>>> +
>>>>>> +            push @pieces, \$copy if length $copy;
>>>>>> +        },
>>>>>> +        'text,tagname',
>>>>>> +    );
>>>>>> +
>>>>>> +    $p->parse($html);
>>>>>> +    $p->eof();
>>>>>> +
>>>>>> +    my $payload = join( q< >, map { $$_ } @pieces );
>>>>>> +
>>>>>> +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
>>>>>> +    # plain spaces.
>>>>>> +    $payload =~ s<[^\S\x{a0}]+>< >g;
>>>>>> +
>>>>>> +    return $payload;
>>>>>> +}
>>>>>> +
>>>>>> +1;
>>>>>> diff --git a/t/pyzor.t b/t/pyzor.t
>>>>>> index 891f38d..e4ef83f 100755
>>>>>> --- a/t/pyzor.t
>>>>>> +++ b/t/pyzor.t
>>>>>> @@ -3,12 +3,9 @@
>>>>>>   use lib '.'; use lib 't';
>>>>>>   use SATest; sa_t_init("pyzor");
>>>>>> -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
>>>>>> -
>>>>>>   use Test::More;
>>>>>>   plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
>>>>>> -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
>>>>>> -plan tests => 8;
>>>>>> +plan tests => 5;
>>>>>>   diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
>>>>>> @@ -30,7 +27,7 @@ tstprefs ("
>>>>>>   sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>>>>>>   ok_all_patterns();
>>>>>>   # Same with fork
>>>>>> -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
>>>>>> +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>>>>>>   ok_all_patterns();
>>>>>>   #TESTING FOR HAM
>>>>>> @@ -44,7 +41,3 @@ ok_all_patterns();
>>>>>>   sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
>>>>>>   ok_all_patterns();
>>>>>> -# same with fork
>>>>>> -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
>>>>>> -ok_all_patterns();
>>>>>> -
>>>>>
>>>
>> -- 
>> Kevin A. McGrail
>> KMcGrail@Apache.org
>>
>> Member, Apache Software Foundation
>> Chair Emeritus Apache SpamAssassin Project
>> https://www.linkedin.com/in/kmcgrail - 703.798.0171


Re: new Pyzor implementation

Posted by Henrik K <he...@hege.li>.
Atleast these seem completely unneeded module dependencies.

IO::SigGuard (not even found in Ubuntu packages)
Email::MIME

So the code should be refactored to use SA methods as necessary.


On Sat, Oct 16, 2021 at 11:06:07PM -0400, Kevin A. McGrail wrote:
> No worries there that I know of.
> 
> cPanel has the paperwork for CCLA on file and several people with ICLA's as
> well.  They've given us permission to commit the code too.
> 
> I think it will be better than any dependency on external binaries.
> 
> Regards,
> 
> KAM
> 
> On 10/14/2021 10:37 AM, Henrik K wrote:
> > If that's the case, I probably wouldn't have any objections.  Not sure if it
> > requires some Contributor License Agreement from cPanels part (maybe they
> > already have one), and I guess atleast a bug to make it official..  Sidney
> > or KAM can probably chime in on the admin side..
> > 
> > 
> > On Thu, Oct 14, 2021 at 04:32:53PM +0200, Giovanni Bechis wrote:
> > > Once committed, code will be no more developed by cPanel on CPAN
> > > and original code will be removed.
> > > 
> > > I can work to integrate old and new Pyzor versions.
> > > 
> > >   Giovanni
> > > 
> > > On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
> > > > If it's developed by cPanel in CPAN, then it should not be committed to SA,
> > > > unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
> > > > we have developer resources and will to take it aboard.
> > > > 
> > > > As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
> > > > as it makes no sense to ditch support for the widely installed original
> > > > Pyzor.
> > > > 
> > > > 
> > > > On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
> > > > > Hi,
> > > > > cPanel has developed a native Perl Pyzor implementation for SpamAssassin
> > > > > and a diff against SpamAssassin 4.0 follows.
> > > > > Atm I am using it in production on a small server, more tests and
> > > > > opinions are welcome.
> > > > > 
> > > > > Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
> > > > > 
> > > > >   Cheers
> > > > >    Giovanni
> > > > > 
> > > > > diff --git a/MANIFEST b/MANIFEST
> > > > > index 25d0192..2d9588c 100644
> > > > > --- a/MANIFEST
> > > > > +++ b/MANIFEST
> > > > > @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
> > > > >   lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
> > > > >   lib/Mail/SpamAssassin/PluginHandler.pm
> > > > >   lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor.pm
> > > > >   lib/Mail/SpamAssassin/RegistryBoundaries.pm
> > > > >   lib/Mail/SpamAssassin/Reporter.pm
> > > > >   lib/Mail/SpamAssassin/SQLBasedAddrList.pm
> > > > > diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > index 3efd4b4..e4c9c05 100644
> > > > > --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
> > > > >   use Mail::SpamAssassin::Plugin;
> > > > >   use Mail::SpamAssassin::Logger;
> > > > > -use Mail::SpamAssassin::Timeout;
> > > > > -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
> > > > > -                                proc_status_ok exit_status_str);
> > > > > +use Mail::SpamAssassin::Util qw(untaint_var);
> > > > > +
> > > > >   use strict;
> > > > >   use warnings;
> > > > >   # use bytes;
> > > > >   use re 'taint';
> > > > > -use Storable;
> > > > > -use POSIX qw(PIPE_BUF WNOHANG _exit);
> > > > > -
> > > > >   our @ISA = qw(Mail::SpamAssassin::Plugin);
> > > > >   sub new {
> > > > > @@ -78,7 +74,7 @@ sub set_config {
> > > > >     my ($self, $conf) = @_;
> > > > >     my @cmds;
> > > > > -=head1 USER OPTIONS
> > > > > +=head1 ADMINISTRATOR OPTIONS
> > > > >   =over 4
> > > > > @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
> > > > >     });
> > > > > -=item pyzor_fork (0|1)		(default: 0)
> > > > > -
> > > > > -Instead of running Pyzor synchronously, fork separate process for it and
> > > > > -read the results in later (similar to async DNS lookups).  Increases
> > > > > -throughput.  Experimental.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push(@cmds, {
> > > > > -    setting => 'pyzor_fork',
> > > > > -    is_admin => 1,
> > > > > -    default => 0,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_count_min NUMBER	(default: 5)
> > > > > +=item pyzor_count_min NUMBER		(default: 5)
> > > > >   This option sets how often a message's body checksum must have been
> > > > >   reported to the Pyzor server before SpamAssassin will consider the Pyzor
> > > > > @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > > -  # Deprecated setting, the name makes no sense!
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_max',
> > > > > -    is_admin => 1,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
> > > > > -      if ($value !~ /^\d+$/) {
> > > > > -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -      $self->{pyzor_count_min} = $value;
> > > > > -    }
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_whitelist_min NUMBER	(default: 10)
> > > > > -
> > > > > -This option sets how often a message's body checksum must have been
> > > > > -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > > > > -result.  Final decision is made by pyzor_whitelist_factor.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_whitelist_min',
> > > > > -    is_admin => 1,
> > > > > -    default => 10,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
> > > > > -
> > > > > -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > > > -For default setting this means: 50 reports requires 10 whitelistings.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_whitelist_factor',
> > > > > -    is_admin => 1,
> > > > > -    default => 0.2,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > > -  });
> > > > > -
> > > > >   =back
> > > > > -=head1 ADMINISTRATOR OPTIONS
> > > > > -
> > > > >   =over 4
> > > > >   =item pyzor_timeout n		(default: 5)
> > > > > @@ -210,478 +145,182 @@ removing one of them.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
> > > > >     });
> > > > > -=item pyzor_options options
> > > > > +=item pyzor_whitelist_min NUMBER        (default: 10)
> > > > > -Specify additional options to the pyzor(1) command. Please note that only
> > > > > -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
> > > > > +This option sets how often a message's body checksum must have been
> > > > > +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > > > > +result.  Final decision is made by pyzor_whitelist_factor.
> > > > >   =cut
> > > > >     push (@cmds, {
> > > > > -    setting => 'pyzor_options',
> > > > > +    setting => 'pyzor_whitelist_min',
> > > > >       is_admin => 1,
> > > > > -    default => '',
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
> > > > > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -      $self->{pyzor_options} = $1;
> > > > > -    }
> > > > > +    default => 10,
> > > > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > > -=item pyzor_path STRING
> > > > > +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
> > > > > -This option tells SpamAssassin specifically where to find the C<pyzor>
> > > > > -client instead of relying on SpamAssassin to find it in the current
> > > > > -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
> > > > > -you should use this, as the current PATH will have been cleared.
> > > > > +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > > > +For default setting this means: 50 reports requires 10 whitelistings.
> > > > >   =cut
> > > > >     push (@cmds, {
> > > > > -    setting => 'pyzor_path',
> > > > > +    setting => 'pyzor_whitelist_factor',
> > > > >       is_admin => 1,
> > > > > -    default => undef,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      if (!defined $value || !length $value) {
> > > > > -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> > > > > -      }
> > > > > -      $value = untaint_file_path($value);
> > > > > -      if (!-x $value) {
> > > > > -	info("config: pyzor_path \"$value\" isn't an executable");
> > > > > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -
> > > > > -      $self->{pyzor_path} = $value;
> > > > > -    }
> > > > > +    default => 0.2,
> > > > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > >     $conf->{parser}->register_commands(\@cmds);
> > > > >   }
> > > > >   sub is_pyzor_available {
> > > > > -  my ($self) = @_;
> > > > > +    my ($self) = @_;
> > > > > -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
> > > > > -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
> > > > > -
> > > > > -  unless ($pyzor && -x $pyzor) {
> > > > > -    dbg("pyzor: no pyzor executable found");
> > > > > -    $self->{pyzor_available} = 0;
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  # remember any found pyzor
> > > > > -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
> > > > > -
> > > > > -  dbg("pyzor: pyzor is available: $pyzor");
> > > > > -  return 1;
> > > > > +    local $@;
> > > > > +    eval {
> > > > > +        require Mail::SpamAssassin::Pyzor::Digest;
> > > > > +        require Mail::SpamAssassin::Pyzor::Client;
> > > > > +    };
> > > > > +    return $@ ? 0 : 1;
> > > > >   }
> > > > > -sub finish_parsing_start {
> > > > > -  my ($self, $opts) = @_;
> > > > > +sub get_pyzor_interface {
> > > > > +  my ($self) = @_;
> > > > > -  # If forking, hard adjust priority -100 to launch early
> > > > > -  # Find rulenames from eval_to_rule mappings
> > > > > -  if ($opts->{conf}->{pyzor_fork}) {
> > > > > -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
> > > > > -      dbg("pyzor: adjusting rule $_ priority to -100");
> > > > > -      $opts->{conf}->{priority}->{$_} = -100;
> > > > > -    }
> > > > > +  if (!$self->{main}->{conf}->{use_pyzor}) {
> > > > > +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
> > > > > +    $self->{pyzor_interface} = "disabled";
> > > > > +    $self->{pyzor_available} = 0;
> > > > > +  }
> > > > > +  elsif ($self->is_pyzor_available()) {
> > > > > +    $self->{pyzor_interface} = "pyzor";
> > > > > +    $self->{pyzor_available} = 1;
> > > > > +  }
> > > > > +  else {
> > > > > +    dbg("pyzor: no pyzor found, disabling Pyzor");
> > > > > +    $self->{pyzor_available} = 0;
> > > > >     }
> > > > >   }
> > > > >   sub check_pyzor {
> > > > > -  my ($self, $pms, $full) = @_;
> > > > > -
> > > > > -  return 0 if !$self->{pyzor_available};
> > > > > -  return 0 if !$self->{main}->{conf}->{use_pyzor};
> > > > > -
> > > > > -  return 0 if $pms->{pyzor_running};
> > > > > -  $pms->{pyzor_running} = 1;
> > > > > -
> > > > > -  return 0 if !$self->is_pyzor_available();
> > > > > -
> > > > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > +  my ($self, $permsgstatus, $full) = @_;
> > > > >     # initialize valid tags
> > > > > -  $pms->{tag_data}->{PYZOR} = '';
> > > > > -
> > > > > -  # create fulltext tmpfile now (before possible forking)
> > > > > -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
> > > > > -
> > > > > -  ## non-forking method
> > > > > -
> > > > > -  if (!$self->{main}->{conf}->{pyzor_fork}) {
> > > > > -    my @results = $self->pyzor_lookup($pms);
> > > > > -    return $self->_check_result($pms, \@results);
> > > > > -  }
> > > > > -
> > > > > -  ## forking method
> > > > > -
> > > > > -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
> > > > > -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
> > > > > -
> > > > > -  # create socketpair for communication
> > > > > -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
> > > > > -  my $back_selector = '';
> > > > > -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
> > > > > -  eval {
> > > > > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
> > > > > -  } or do {
> > > > > -    dbg("pyzor: backchannel pre-setup failed: $@");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  };
> > > > > +  $permsgstatus->{tag_data}->{PYZOR} = "";
> > > > > -  my $pid = fork();
> > > > > -  if (!defined $pid) {
> > > > > -    info("pyzor: child fork failed: $!");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -  if (!$pid) {
> > > > > -    $0 = "$0 (pyzor)";
> > > > > -    $SIG{CHLD} = 'DEFAULT';
> > > > > -    $SIG{PIPE} = 'IGNORE';
> > > > > -    $SIG{$_} = sub {
> > > > > -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
> > > > > -      _exit(6);  # avoid END and destructor processing
> > > > > -      kill('KILL',$$);  # still kicking? die!
> > > > > -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
> > > > > -    dbg("pyzor: child process $$ forked");
> > > > > -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
> > > > > -    my @results = $self->pyzor_lookup($pms);
> > > > > -    my $backmsg;
> > > > > -    eval {
> > > > > -      $backmsg = Storable::freeze(\@results);
> > > > > -    };
> > > > > -    if ($@) {
> > > > > -      dbg("pyzor: child return value freeze failed: $@");
> > > > > -      _exit(0); # avoid END and destructor processing
> > > > > -    }
> > > > > -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
> > > > > -      dbg("pyzor: child backchannel write failed: $!");
> > > > > -    }
> > > > > -    _exit(0); # avoid END and destructor processing
> > > > > -  }
> > > > > -
> > > > > -  $pms->{pyzor_pid} = $pid;
> > > > > +  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > -  eval {
> > > > > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
> > > > > -  } or do {
> > > > > -    dbg("pyzor: backchannel post-setup failed: $@");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  };
> > > > > +  $self->get_pyzor_interface();
> > > > > +  return 0 unless $self->{pyzor_available};
> > > > > -  return 0;
> > > > > +  return $self->pyzor_lookup($permsgstatus, $full);
> > > > >   }
> > > > >   sub pyzor_lookup {
> > > > > -  my ($self, $pms) = @_;
> > > > > -
> > > > > -  my $conf = $self->{main}->{conf};
> > > > > -  my $timeout = $conf->{pyzor_timeout};
> > > > > -
> > > > > -  # note: not really tainted, this came from system configuration file
> > > > > -  my $path = untaint_file_path($conf->{pyzor_path});
> > > > > -  my $opts = untaint_var($conf->{pyzor_options}) || '';
> > > > > -
> > > > > -  $pms->enter_helper_run_mode();
> > > > > -
> > > > > -  my $pid;
> > > > > -  my @resp;
> > > > > -  my $timer = Mail::SpamAssassin::Timeout->new(
> > > > > -           { secs => $timeout, deadline => $pms->{master_deadline} });
> > > > > -  my $err = $timer->run_and_catch(sub {
> > > > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > > > -
> > > > > -    dbg("pyzor: opening pipe: ".
> > > > > -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
> > > > > -
> > > > > -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > > > -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
> > > > > -    $pid or die "$!\n";
> > > > > -
> > > > > -    # read+split avoids a Perl I/O bug (Bug 5985)
> > > > > -    my($inbuf, $nread);
> > > > > -    my $resp = '';
> > > > > -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
> > > > > -    defined $nread  or die "error reading from pipe: $!";
> > > > > -    @resp = split(/^/m, $resp, -1);
> > > > > -
> > > > > -    my $errno = 0;
> > > > > -    close PYZOR or $errno = $!;
> > > > > -    if (proc_status_ok($?, $errno)) {
> > > > > -      dbg("pyzor: [%s] finished successfully", $pid);
> > > > > -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
> > > > > -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
> > > > > -    } else {
> > > > > -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > > > > -    }
> > > > > -
> > > > > -  });
> > > > > -
> > > > > -  if (defined(fileno(*PYZOR))) {  # still open
> > > > > -    if ($pid) {
> > > > > -      if (kill('TERM', $pid)) {
> > > > > -        dbg("pyzor: killed stale helper [$pid]");
> > > > > -      } else {
> > > > > -        dbg("pyzor: killing helper application [$pid] failed: $!");
> > > > > -      }
> > > > > -    }
> > > > > -    my $errno = 0;
> > > > > -    close PYZOR or $errno = $!;
> > > > > -    proc_status_ok($?, $errno)
> > > > > -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > > > > -  }
> > > > > -
> > > > > -  $pms->leave_helper_run_mode();
> > > > > -
> > > > > -  if ($timer->timed_out()) {
> > > > > -    dbg("pyzor: check timed out after $timeout seconds");
> > > > > -    return ();
> > > > > -  } elsif ($err) {
> > > > > -    chomp $err;
> > > > > -    info("pyzor: check failed: $err");
> > > > > -    return ();
> > > > > -  }
> > > > > -
> > > > > -  return @resp;
> > > > > -}
> > > > > -
> > > > > -sub check_tick {
> > > > > -  my ($self, $opts) = @_;
> > > > > -  $self->_check_forked_result($opts->{permsgstatus}, 0);
> > > > > -}
> > > > > -
> > > > > -sub check_cleanup {
> > > > > -  my ($self, $opts) = @_;
> > > > > -  $self->_check_forked_result($opts->{permsgstatus}, 1);
> > > > > -}
> > > > > -
> > > > > -sub _check_forked_result {
> > > > > -  my ($self, $pms, $finish) = @_;
> > > > > -
> > > > > -  return 0 if !$pms->{pyzor_backchannel};
> > > > > -  return 0 if !$pms->{pyzor_pid};
> > > > > +    my ( $self, $permsgstatus, $fulltext ) = @_;
> > > > > +    my $conf = $self->{main}->{conf};
> > > > > +    my $timeout = $conf->{pyzor_timeout};
> > > > > +
> > > > > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
> > > > > +
> > > > > +    local $@;
> > > > > +    my $ref = eval { $client->check($digest); };
> > > > > +    dbg("pyzor: got response: $client->{'_server_host'}");
> > > > > +    # $client reply must be an hash
> > > > > +    return 0 if (not (ref $ref eq ref {}));
> > > > > +    if ($@) {
> > > > > +        my $err = $@;
> > > > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > +        $err = eval { $err->get_message() } || $err;
> > > > > -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
> > > > > -
> > > > > -  my $kid_pid = $pms->{pyzor_pid};
> > > > > -  # if $finish, force waiting for the child
> > > > > -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
> > > > > -  if ($pid == 0) {
> > > > > -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
> > > > > -    if ($pms->{pyzor_abort}) {
> > > > > -      dbg("pyzor: bailing out due to deadline/shortcircuit");
> > > > > -      kill('TERM', $kid_pid);
> > > > > -      if (waitpid($kid_pid, WNOHANG) == 0) {
> > > > > -        sleep(1);
> > > > > -        if (waitpid($kid_pid, WNOHANG) == 0) {
> > > > > -          dbg("pyzor: child process $kid_pid still alive, KILL");
> > > > > -          kill('KILL', $kid_pid);
> > > > > -          waitpid($kid_pid, 0);
> > > > > +        warn("pyzor: check failed: $err\n");
> > > > > +        return 0;
> > > > > +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
> > > > > +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
> > > > > +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > > > > +        } else {
> > > > > +          dbg("pyzor: check failed with undefined code");
> > > > >           }
> > > > > -      }
> > > > > -      delete $pms->{pyzor_pid};
> > > > > -      delete $pms->{pyzor_backchannel};
> > > > > +        return 0;
> > > > >       }
> > > > > -    return 0;
> > > > > -  } elsif ($pid == -1) {
> > > > > -    # child does not exist?
> > > > > -    dbg("pyzor: child process $kid_pid already handled?");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
> > > > > +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
> > > > > +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
> > > > > +    my $count_min = $conf->{pyzor_count_min};
> > > > > +    my $wl_min = $conf->{pyzor_whitelist_min};
> > > > > -  dbg("pyzor: child process $kid_pid finished, reading results");
> > > > > +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
> > > > > +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
> > > > > -  my $backmsg;
> > > > > -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
> > > > > -  if (!defined $ret || $ret == 0) {
> > > > > -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  delete $pms->{pyzor_backchannel};
> > > > > +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
> > > > > -  my $results;
> > > > > -  eval {
> > > > > -    $results = Storable::thaw($backmsg);
> > > > > -  };
> > > > > -  if ($@) {
> > > > > -    dbg("pyzor: child return value thaw failed: $@");
> > > > > -    return;
> > > > > -  }
> > > > > -
> > > > > -  $self->_check_result($pms, $results);
> > > > > -}
> > > > > +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
> > > > > +      $wl_limit);
> > > > > -sub _check_result {
> > > > > -  my ($self, $pms, $results) = @_;
> > > > > -
> > > > > -  if (!@$results) {
> > > > > -    dbg("pyzor: no response from server");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  my $count = 0;
> > > > > -  my $count_wl = 0;
> > > > > -  foreach my $res (@$results) {
> > > > > -    chomp($res);
> > > > > -    if ($res =~ /^Traceback/) {
> > > > > -      info("pyzor: internal error, python traceback seen in response: $res");
> > > > > +    # Empty body etc results in same hash, we should skip very large numbers..
> > > > > +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
> > > > > +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> > > > >         return 0;
> > > > >       }
> > > > > -    dbg("pyzor: got response: $res");
> > > > > -    # this regexp is intended to be a little bit forgiving
> > > > > -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
> > > > > -      # until pyzor servers can sync their DBs,
> > > > > -      # sum counts obtained from all servers
> > > > > -      $count += untaint_var($1)+0; # crazy but needs untainting
> > > > > -      $count_wl += untaint_var($2)+0;
> > > > > -    } else {
> > > > > -      # warn on failures to parse
> > > > > -      info("pyzor: failure to parse response \"$res\"");
> > > > > -    }
> > > > > -  }
> > > > > -
> > > > > -  my $conf = $self->{main}->{conf};
> > > > > -
> > > > > -  my $count_min = $conf->{pyzor_count_min};
> > > > > -  my $wl_min = $conf->{pyzor_whitelist_min};
> > > > > -  my $wl_limit = $count_wl >= $wl_min ?
> > > > > -    $count * $conf->{pyzor_whitelist_factor} : 0;
> > > > > -
> > > > > -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
> > > > > -    $wl_limit);
> > > > > -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
> > > > > -
> > > > > -  # Empty body etc results in same hash, we should skip very large numbers..
> > > > > -  if ($count >= 1000000 || $count_wl >= 10000) {
> > > > > -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  # Whitelisted?
> > > > > -  if ($wl_limit && $count_wl >= $wl_limit) {
> > > > > -    dbg("pyzor: message whitelisted");
> > > > > -    return 0;
> > > > > -  }
> > > > > +    # Whitelisted?
> > > > > +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
> > > > > +      dbg("pyzor: message whitelisted");
> > > > > +      return 0;
> > > > > +    }
> > > > > -  if ($count >= $count_min) {
> > > > > -    if ($conf->{pyzor_fork}) {
> > > > > -      # forked needs to run got_hit()
> > > > > -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
> > > > > +    if ( $pyzor_count >= $count_min ) {
> > > > > +      return 1;
> > > > >       }
> > > > > -    return 1;
> > > > > -  }
> > > > > -  return 0;
> > > > > +    return 0;
> > > > >   }
> > > > >   sub plugin_report {
> > > > >     my ($self, $options) = @_;
> > > > > -  return if !$self->{pyzor_available};
> > > > > -  return if !$self->{main}->{conf}->{use_pyzor};
> > > > > -  return if $options->{report}->{options}->{dont_report_to_pyzor};
> > > > > -  return if !$self->is_pyzor_available();
> > > > > -
> > > > > -  # use temporary file: open2() is unreliable due to buffering under spamd
> > > > > -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
> > > > > -  if ($self->pyzor_report($options, $tmpf)) {
> > > > > -    $options->{report}->{report_available} = 1;
> > > > > -    info("reporter: spam reported to Pyzor");
> > > > > -    $options->{report}->{report_return} = 1;
> > > > > -  }
> > > > > -  else {
> > > > > -    info("reporter: could not report spam to Pyzor");
> > > > > -  }
> > > > > -  $options->{report}->delete_fulltext_tmpfile($tmpf);
> > > > > +  return unless $self->{pyzor_available};
> > > > > +  return unless $self->{main}->{conf}->{use_pyzor};
> > > > > -  return 1;
> > > > > +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
> > > > > +  {
> > > > > +    if ($self->pyzor_report($options)) {
> > > > > +      $options->{report}->{report_available} = 1;
> > > > > +      info("reporter: spam reported to Pyzor");
> > > > > +      $options->{report}->{report_return} = 1;
> > > > > +    }
> > > > > +    else {
> > > > > +      info("reporter: could not report spam to Pyzor");
> > > > > +    }
> > > > > +  }
> > > > >   }
> > > > >   sub pyzor_report {
> > > > > -  my ($self, $options, $tmpf) = @_;
> > > > > -
> > > > > -  # note: not really tainted, this came from system configuration file
> > > > > -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
> > > > > -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
> > > > > +    my ( $self, $options ) = @_;
> > > > > -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > > > > +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > > > > -  $options->{report}->enter_helper_run_mode();
> > > > > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > > > > -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
> > > > > -  my $err = $timer->run_and_catch(sub {
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
> > > > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > > > -
> > > > > -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
> > > > > -
> > > > > -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > > > -	$tmpf, 1, $path, split(' ', $opts), "report");
> > > > > -    $pid or die "$!\n";
> > > > > -
> > > > > -    my($inbuf,$nread,$nread_all); $nread_all = 0;
> > > > > -    # response is ignored, just check its existence
> > > > > -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
> > > > > -    defined $nread  or die "error reading from pipe: $!";
> > > > > -
> > > > > -    dbg("pyzor: empty response")  if $nread_all < 1;
> > > > > -
> > > > > -    my $errno = 0;  close PYZOR or $errno = $!;
> > > > > -    # closing a pipe also waits for the process executing on the pipe to
> > > > > -    # complete, no need to explicitly call waitpid
> > > > > -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
> > > > > -    if (proc_status_ok($?,$errno, 0)) {
> > > > > -      dbg("pyzor: [%s] reporter finished successfully", $pid);
> > > > > -    } else {
> > > > > -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
> > > > > +    local $@;
> > > > > +    my $ref = eval { $client->report($digest); };
> > > > > +    if ($@) {
> > > > > +        warn("pyzor: report failed: $@");
> > > > > +        return 0;
> > > > >       }
> > > > > -
> > > > > -  });
> > > > > -
> > > > > -  $options->{report}->leave_helper_run_mode();
> > > > > -
> > > > > -  if ($timer->timed_out()) {
> > > > > -    dbg("reporter: pyzor report timed out after $timeout seconds");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  if ($err) {
> > > > > -    chomp $err;
> > > > > -    if ($err eq '__brokenpipe__ignore__') {
> > > > > -      dbg("reporter: pyzor report failed: broken pipe");
> > > > > -    } else {
> > > > > -      warn("reporter: pyzor report failed: $err\n");
> > > > > +    elsif ( $ref->{'Code'} ne 200 ) {
> > > > > +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > > > > +        return 0;
> > > > >       }
> > > > > -    return 0;
> > > > > -  }
> > > > > -  return 1;
> > > > > +    return 1;
> > > > >   }
> > > > > -# Version features
> > > > > -sub has_fork { 1 }
> > > > > -
> > > > >   1;
> > > > > -
> > > > > -=back
> > > > > -
> > > > > -=cut
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
> > > > > new file mode 100644
> > > > > index 0000000..8ac27f4
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor.pm
> > > > > @@ -0,0 +1,56 @@
> > > > > +package Mail::SpamAssassin::Pyzor;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </...@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +our $VERSION = '0.06_01';
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This distribution contains Perl implementations of parts of
> > > > > +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
> > > > > +It is intended for use with L<Mail::SpamAssassin> but may be useful
> > > > > +in other contexts.
> > > > > +
> > > > > +See the following modules for information on specific tools that
> > > > > +the distribution includes:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * L<Mail::SpamAssassin::Pyzor::Client>
> > > > > +
> > > > > +=item * L<Mail::SpamAssassin::Pyzor::Digest>
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > new file mode 100644
> > > > > index 0000000..ccff868
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > @@ -0,0 +1,415 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Client;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </...@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    use Mail::SpamAssassin::Pyzor::Client ();
> > > > > +    use Mail::SpamAssassin::Pyzor::Digest ();
> > > > > +
> > > > > +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
> > > > > +
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
> > > > > +
> > > > > +    my $check_ref = $client->check($digest);
> > > > > +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
> > > > > +
> > > > > +    my $report_ref = $client->report($digest);
> > > > > +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
> > > > > +implements the functionality needed for L<Mail::SpamAssassin>.
> > > > > +
> > > > > +=head1 PROTOCOL DETAILS
> > > > > +
> > > > > +The Pyzor protocol is not a published standard, and there appears to be
> > > > > +no meaningful public documentation. What follows is enough information,
> > > > > +largely gleaned through forum posts and reverse engineering, to facilitate
> > > > > +effective use of this module:
> > > > > +
> > > > > +Pyzor is an RPC-oriented, message-based protocol. Each message
> > > > > +is a simple dictionary of 7-bit ASCII keys and values. Server responses
> > > > > +always include at least the following:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
> > > > > +is an error.
> > > > > +
> > > > > +=item * C<Diag> - Similar to HTTP status reasons: a text description
> > > > > +of the status.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +(NB: There are additional standard response headers that are useful only for
> > > > > +the protocol itself and thus are not part of this module???s returns.)
> > > > > +
> > > > > +=head2 Reliability
> > > > > +
> > > > > +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
> > > > > +destination. A transmission failure can happen in either the request or
> > > > > +the response; in either case, a timeout error will result. Such errors
> > > > > +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +our $VERSION = '0.04';
> > > > > +
> > > > > +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
> > > > > +our $DEFAULT_SERVER_PORT    = 24441;
> > > > > +our $DEFAULT_USERNAME       = 'anonymous';
> > > > > +our $DEFAULT_PASSWORD       = '';
> > > > > +our $DEFAULT_OP_SPEC        = '20,3,60,3';
> > > > > +our $PYZOR_PROTOCOL_VERSION = 2.1;
> > > > > +our $DEFAULT_TIMEOUT        = 3.5;
> > > > > +our $READ_SIZE              = 8192;
> > > > > +
> > > > > +use IO::Socket::INET ();
> > > > > +use Digest::SHA qw(sha1 sha1_hex);
> > > > > +
> > > > > +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 CONSTRUCTOR
> > > > > +
> > > > > +=head2 new(%OPTS)
> > > > > +
> > > > > +Create a new pyzor client.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +%OPTS are (all optional):
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item * C<server_host> - The pyzor server host to connect to (default is
> > > > > +C<public.pyzor.org>)
> > > > > +
> > > > > +=item * C<server_port> - The pyzor server port to connect to (default is
> > > > > +24441)
> > > > > +
> > > > > +=item * C<username> - The username to present to the pyzor server (default
> > > > > +is C<anonymous>)
> > > > > +
> > > > > +=item * C<password> - The password to present to the pyzor server (default
> > > > > +is empty)
> > > > > +
> > > > > +=item * C<timeout> - The maximum time, in seconds, to wait for a response
> > > > > +from the pyzor server (defeault is 3.5)
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub new {
> > > > > +    my ( $class, %OPTS ) = @_;
> > > > > +
> > > > > +    return bless {
> > > > > +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
> > > > > +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
> > > > > +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
> > > > > +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
> > > > > +        '_op_spec'     => $DEFAULT_OP_SPEC,
> > > > > +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
> > > > > +    }, $class;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 REQUEST METHODS
> > > > > +
> > > > > +=head2 report($digest)
> > > > > +
> > > > > +Report the digest of a spam message to the pyzor server. This function
> > > > > +will throw if a messaging failure or timeout happens.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item $digest C<SCALAR>
> > > > > +
> > > > > +The message digest to report, as given by
> > > > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item C<HASHREF>
> > > > > +
> > > > > +Returns a hashref of the standard attributes noted above.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub report {
> > > > > +    my ( $self, $digest ) = @_;
> > > > > +
> > > > > +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
> > > > > +
> > > > > +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
> > > > > +
> > > > > +    return $self->_send_receive_msg($msg_ref);
> > > > > +}
> > > > > +
> > > > > +=head2 check($digest)
> > > > > +
> > > > > +Check the digest of a message to see if
> > > > > +the pyzor server has a report for it. This function
> > > > > +will throw if a messaging failure or timeout happens.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item $digest C<SCALAR>
> > > > > +
> > > > > +The message digest to check, as given by
> > > > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item C<HASHREF>
> > > > > +
> > > > > +Returns a hashref of the standard attributes noted above
> > > > > +as well as the following:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * C<Count> - The number of reports the server has received
> > > > > +for the given digest.
> > > > > +
> > > > > +=item * C<WL-Count> - The number of whitelist requests the server has received
> > > > > +for the given digest.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub check {
> > > > > +    my ( $self, $digest ) = @_;
> > > > > +
> > > > > +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
> > > > > +}
> > > > > +
> > > > > +# ----------------------------------------
> > > > > +
> > > > > +sub _send_receive_msg {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
> > > > > +
> > > > > +    $self->_sign_msg($msg_ref);
> > > > > +
> > > > > +    return $self->_do_send_receive(
> > > > > +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
> > > > > +        $thread_id,
> > > > > +    );
> > > > > +}
> > > > > +
> > > > > +sub _get_base_msg {
> > > > > +    my ( $self, $op, $digest ) = @_;
> > > > > +
> > > > > +    die "Implementor error: op is required" if !$op;
> > > > > +    die "error: digest is required"         if !$digest;
> > > > > +
> > > > > +    return {
> > > > > +        'User'      => $self->{'_username'},
> > > > > +        'PV'        => $PYZOR_PROTOCOL_VERSION,
> > > > > +        'Time'      => time(),
> > > > > +        'Op'        => $op,
> > > > > +        'Op-Digest' => $digest,
> > > > > +        'Thread'    => $self->_generate_thread_id()
> > > > > +    };
> > > > > +}
> > > > > +
> > > > > +sub _do_send_receive {
> > > > > +    my ( $self, $packet, $thread_id ) = @_;
> > > > > +
> > > > > +    my $sock = $self->_get_connection_or_die();
> > > > > +
> > > > > +    $self->_send_packet( $sock, $packet );
> > > > > +    my $response = $self->_receive_packet( $sock, $thread_id );
> > > > > +
> > > > > +    return 0 if not defined $response;
> > > > > +
> > > > > +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
> > > > > +
> > > > > +    delete $resp_hr->{'Thread'};
> > > > > +
> > > > > +    my $response_pv = delete $resp_hr->{'PV'};
> > > > > +
> > > > > +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
> > > > > +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
> > > > > +    }
> > > > > +
> > > > > +    return $resp_hr;
> > > > > +}
> > > > > +
> > > > > +sub _receive_packet {
> > > > > +    my ( $self, $sock, $thread_id ) = @_;
> > > > > +
> > > > > +    my $timeout = $self->{'_timeout'} * 1000;
> > > > > +
> > > > > +    my $end_time = time + $self->{'_timeout'};
> > > > > +
> > > > > +    $sock->blocking(0);
> > > > > +    my $response = '';
> > > > > +    my $rout     = '';
> > > > > +    my $rin      = '';
> > > > > +    vec( $rin, fileno($sock), 1 ) = 1;
> > > > > +
> > > > > +    while (1) {
> > > > > +        my $time_left = $end_time - time;
> > > > > +
> > > > > +        if ( $time_left <= 0 ) {
> > > > > +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
> > > > > +          return;
> > > > > +        }
> > > > > +
> > > > > +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
> > > > > +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
> > > > > +            warn "read from socket: $!";
> > > > > +        }
> > > > > +
> > > > > +        if ( index( $response, "\n\n" ) > -1 ) {
> > > > > +
> > > > > +            # Reject the response unless its thread ID matches what we sent.
> > > > > +            # This prevents confusion among concurrent Pyzor reqeusts.
> > > > > +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
> > > > > +                last;
> > > > > +            }
> > > > > +            else {
> > > > > +                $response = '';
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        my $found = select( $rout = $rin, undef, undef, $time_left );
> > > > > +        warn "select(): $!" if $found == -1;
> > > > > +    }
> > > > > +
> > > > > +    return $response;
> > > > > +}
> > > > > +
> > > > > +sub _send_packet {
> > > > > +    my ( $self, $sock, $packet ) = @_;
> > > > > +
> > > > > +    $sock->blocking(1);
> > > > > +    syswrite( $sock, $packet ) or warn "write to socket: $!";
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +sub _get_connection_or_die {
> > > > > +    my ($self) = @_;
> > > > > +
> > > > > +    # clear the socket if the PID changes
> > > > > +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
> > > > > +        undef $self->{'_sock_pid'};
> > > > > +        undef $self->{'_sock'};
> > > > > +    }
> > > > > +
> > > > > +    $self->{'_sock_pid'} ||= $$;
> > > > > +    $self->{'_sock'}     ||= IO::Socket::INET->new(
> > > > > +        'PeerHost' => $self->{'_server_host'},
> > > > > +        'PeerPort' => $self->{'_server_port'},
> > > > > +        'Proto'    => 'udp'
> > > > > +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
> > > > > +
> > > > > +    return $self->{'_sock'};
> > > > > +}
> > > > > +
> > > > > +sub _sign_msg {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
> > > > > +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
> > > > > +    );
> > > > > +
> > > > > +    return 1;
> > > > > +}
> > > > > +
> > > > > +sub _generate_packet_from_message {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
> > > > > +}
> > > > > +
> > > > > +sub _generate_thread_id {
> > > > > +    my $RAND_MAX = 2**16;
> > > > > +    my $val      = 0;
> > > > > +    $val = int rand($RAND_MAX) while $val < 1024;
> > > > > +    return $val;
> > > > > +}
> > > > > +
> > > > > +sub _get_user_pass_hash_key {
> > > > > +    my ($self) = @_;
> > > > > +
> > > > > +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > new file mode 100644
> > > > > index 0000000..0e8a5ae
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > @@ -0,0 +1,103 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </...@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use Email::MIME ();
> > > > > +
> > > > > +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
> > > > > +use Digest::SHA qw(sha1_hex);
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $hex = get( $MSG )
> > > > > +
> > > > > +This takes an email message in raw MIME text format (i.e., as saved in the
> > > > > +standard mbox format) and returns the message???s Pyzor digest in lower-case
> > > > > +hexadecimal.
> > > > > +
> > > > > +The output from this function should normally be identical to that of
> > > > > +the C<pyzor> script???s C<digest> command. It is suitable for use in
> > > > > +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub get {
> > > > > +    my ($text) = @_;
> > > > > +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
> > > > > +}
> > > > > +
> > > > > +# NB: This is called from the test.
> > > > > +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
> > > > > +    my ($msg_text_sr) = @_;
> > > > > +
> > > > > +    my $parsed = Email::MIME->new($$msg_text_sr);
> > > > > +
> > > > > +    my @lines;
> > > > > +
> > > > > +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
> > > > > +
> > > > > +    for my $payload (@$payloads_ar) {
> > > > > +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
> > > > > +        for my $line (@p_lines) {
> > > > > +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
> > > > > +
> > > > > +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
> > > > > +
> > > > > +            # Make sure we have an octet string.
> > > > > +            utf8::encode($line) if utf8::is_utf8($line);
> > > > > +
> > > > > +            push @lines, $line;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
> > > > > +
> > > > > +    return $digest_sr;
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > new file mode 100644
> > > > > index 0000000..522accd
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > @@ -0,0 +1,301 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </...@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest::Pieces
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
> > > > > +
> > > > > +It reimplements logic found in pyzor???s F<digest.py> module
> > > > > +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use Email::MIME::ContentType ();
> > > > > +use Encode                   ();
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +# each tuple is [ offset, length ]
> > > > > +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
> > > > > +
> > > > > +use constant {
> > > > > +    _MIN_LINE_LENGTH => 8,
> > > > > +
> > > > > +    _ATOMIC_NUM_LINES => 4,
> > > > > +};
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It returns a reference to an array of strings. Each string can be either
> > > > > +a byte string or a character string (e.g., UTF-8 decoded).
> > > > > +
> > > > > +NB: RFC 2822 stipulates that message bodies should use CRLF
> > > > > +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
> > > > > +will thus convert any plain CRs in a quoted-printable message
> > > > > +body into CRLF. Python, though, doesn???t do this, so the output of
> > > > > +our implementation of C<digest_payloads()> diverges from that of the Python
> > > > > +original. It doesn???t ultimately make a difference since the line-ending
> > > > > +whitespace gets trimmed regardless, but it???s necessary to factor in when
> > > > > +comparing the output of our implementation with the Python output.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub digest_payloads {
> > > > > +    my ($parsed) = @_;
> > > > > +
> > > > > +    my @subparts = $parsed->subparts();
> > > > > +
> > > > > +    my @payloads;
> > > > > +
> > > > > +    if (@subparts) {
> > > > > +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
> > > > > +    }
> > > > > +    else {
> > > > > +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
> > > > > +
> > > > > +        my $payload;
> > > > > +
> > > > > +        if ( $main_type eq 'text' ) {
> > > > > +
> > > > > +            # Decode transfer encoding, but leave us as a byte string.
> > > > > +            # Note that this is where Email::MIME converts plain LF to CRLF.
> > > > > +            $payload = $parsed->body();
> > > > > +
> > > > > +            # This does the actual character decoding (i.e., ???charset???).
> > > > > +            $payload = Encode::decode( $encoding, $payload, $encode_check );
> > > > > +
> > > > > +            if ( $subtype eq 'html' ) {
> > > > > +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > > > +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
> > > > > +            }
> > > > > +        }
> > > > > +        else {
> > > > > +
> > > > > +            # This does no decoding, even of, e.g., quoted-printable or base64.
> > > > > +            $payload = $parsed->body_raw();
> > > > > +        }
> > > > > +
> > > > > +        push @payloads, $payload;
> > > > > +    }
> > > > > +
> > > > > +    return \@payloads;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 normalize( $STRING )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It modifies C<$STRING> in-place.
> > > > > +
> > > > > +As with the original implementation, if C<$STRING> contains (decoded)
> > > > > +Unicode characters, those characters will be parsed accordingly. So:
> > > > > +
> > > > > +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
> > > > > +
> > > > > +    normalize($str);
> > > > > +
> > > > > +The above will leave C<$str> alone, but this:
> > > > > +
> > > > > +    utf8::decode($str);
> > > > > +
> > > > > +    normalize($str);
> > > > > +
> > > > > +??? will trim off the last two bytes from C<$str>.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
> > > > > +
> > > > > +    # NULs are bad, mm-kay?
> > > > > +    $_[0] =~ tr<\0><>d;
> > > > > +
> > > > > +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
> > > > > +    # with the /a modifier.
> > > > > +    #
> > > > > +    # https://docs.python.org/2/library/re.html
> > > > > +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
> > > > > +
> > > > > +    # Python: re.compile(r'\S{10,}')
> > > > > +    $_[0] =~ s<\S{10,}><>ag;
> > > > > +
> > > > > +    # Python: re.compile(r'\S+@\S+')
> > > > > +    $_[0] =~ s<\S+ @ \S+><>agx;
> > > > > +
> > > > > +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
> > > > > +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
> > > > > +
> > > > > +    # (from digest.py ???)
> > > > > +    # Make sure we do the whitespace last because some of the previous
> > > > > +    # patterns rely on whitespace.
> > > > > +    $_[0] =~ tr< \x09-\x0d><>d;
> > > > > +
> > > > > +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
> > > > > +    # strip, then calls strip() on the string, which *will* strip Unicode
> > > > > +    # whitespace from the ends.
> > > > > +    $_[0] =~ s<\A\s+><>;
> > > > > +    $_[0] =~ s<\s+\z><>;
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 $yn = should_handle_line( $STRING )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It returns a boolean.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub should_handle_line {
> > > > > +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 $sr = assemble_lines( \@LINES )
> > > > > +
> > > > > +This assembles a string buffer out of @LINES. The string is the buffer
> > > > > +of octets that will be hashed to produce the message digest.
> > > > > +
> > > > > +Each member of @LINES is expected to be an B<octet string>, not a
> > > > > +character string.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub assemble_lines {
> > > > > +    my ($lines_ar) = @_;
> > > > > +
> > > > > +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
> > > > > +
> > > > > +        # cf. handle_atomic() in digest.py
> > > > > +        return \join( q<>, @$lines_ar );
> > > > > +    }
> > > > > +
> > > > > +    #----------------------------------------------------------------------
> > > > > +    # cf. handle_atomic() in digest.py
> > > > > +
> > > > > +    my $str = q<>;
> > > > > +
> > > > > +    for my $ofs_len ( _HASH_SPEC() ) {
> > > > > +        my ( $offset, $length ) = @$ofs_len;
> > > > > +
> > > > > +        for my $i ( 0 .. ( $length - 1 ) ) {
> > > > > +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
> > > > > +
> > > > > +            next if !defined $lines_ar->[$idx];
> > > > > +
> > > > > +            $str .= $lines_ar->[$idx];
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    return \$str;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +use constant _QUOTED_PRINTABLE_NAMES => (
> > > > > +    "quopri-codec",
> > > > > +    "quopri",
> > > > > +    "quoted-printable",
> > > > > +    "quotedprintable",
> > > > > +);
> > > > > +
> > > > > +# Make Encode::decode() ignore anything that doesn???t fit the
> > > > > +# given encoding.
> > > > > +use constant _encode_check_ignore => q<>;
> > > > > +
> > > > > +sub parse_content_type {
> > > > > +    my ($content_type) = @_;
> > > > > +
> > > > > +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
> > > > > +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
> > > > > +        $content_type,
> > > > > +    );
> > > > > +
> > > > > +    my $main = $ct_parse->{'type'}    || q<>;
> > > > > +    my $sub  = $ct_parse->{'subtype'} || q<>;
> > > > > +
> > > > > +    my $encoding = $ct_parse->{'attributes'}{'charset'};
> > > > > +
> > > > > +    my $checkval;
> > > > > +
> > > > > +    if ($encoding) {
> > > > > +
> > > > > +        # Lower-case everything, convert underscore to dash, and remove NUL.
> > > > > +        $encoding =~ tr<A-Z_\0><a-z->d;
> > > > > +
> > > > > +        # Apparently pyzor accommodates messages that put the transfer
> > > > > +        # encoding in the Content-Type.
> > > > > +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
> > > > > +            $checkval = Encode::FB_CROAK();
> > > > > +        }
> > > > > +    }
> > > > > +    else {
> > > > > +        $encoding = 'ascii';
> > > > > +    }
> > > > > +
> > > > > +    # Match Python .decode()???s 'ignore' behavior
> > > > > +    $checkval ||= \&_encode_check_ignore;
> > > > > +
> > > > > +    return ( $main, $sub, $encoding, $checkval );
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 @lines = splitlines( $TEXT )
> > > > > +
> > > > > +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
> > > > > +
> > > > > +Returns a plain list in list context. Returns the number of
> > > > > +items to be returned in scalar context.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub splitlines {
> > > > > +    return split m<\r\n?|\n>, $_[0];
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > new file mode 100644
> > > > > index 0000000..2617b4a
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > @@ -0,0 +1,177 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </...@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest::StripHtml
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This module attempts to duplicate pyzor???s HTML-stripping logic.
> > > > > +
> > > > > +=head1 ACCURACY
> > > > > +
> > > > > +This library cannot achieve 100%, bug-for-bug parity with pyzor
> > > > > +because to do so would require duplicating Python???s own HTML parsing
> > > > > +library. Since that library???s output has changed over time, and those
> > > > > +changes in turn affect pyzor, it???s literally impossible to arrive at
> > > > > +a single, fully-compatible reimplementation.
> > > > > +
> > > > > +That said, all known divergences between pyzor and this library involve
> > > > > +invalid HTML as input.
> > > > > +
> > > > > +Please open bug reports for any divergences you identify, particularly
> > > > > +if the input is valid HTML.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use HTML::Parser ();
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $stripped = strip( $HTML )
> > > > > +
> > > > > +Give it some HTML, and it???ll give back the stripped text.
> > > > > +
> > > > > +In B<general>, the stripping consists of removing tags as well as
> > > > > +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
> > > > > +removes HTML entities.
> > > > > +
> > > > > +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub strip {
> > > > > +    my ($html) = @_;
> > > > > +
> > > > > +    $html =~ s<\A\s+><>;
> > > > > +    $html =~ s<\s+\z><>;
> > > > > +
> > > > > +    my $p = HTML::Parser->new( api_version => 3 );
> > > > > +
> > > > > +    my @pieces;
> > > > > +
> > > > > +    my $accumulate = 1;
> > > > > +
> > > > > +    $p->handler(
> > > > > +        start => sub {
> > > > > +            my ($tagname) = @_;
> > > > > +
> > > > > +            $accumulate = 0 if $tagname eq 'script';
> > > > > +            $accumulate = 0 if $tagname eq 'style';
> > > > > +
> > > > > +            return;
> > > > > +        },
> > > > > +        'tagname',
> > > > > +    );
> > > > > +
> > > > > +    $p->handler(
> > > > > +        end => sub {
> > > > > +            $accumulate = 1;
> > > > > +            return;
> > > > > +        }
> > > > > +    );
> > > > > +
> > > > > +    $p->handler(
> > > > > +        text => sub {
> > > > > +            my ($copy) = @_;
> > > > > +
> > > > > +            return if !$accumulate;
> > > > > +
> > > > > +            # pyzor???s HTML parser discards HTML entities. On top of that,
> > > > > +            # we need to match, as closely as possible, pyzor???s handling of
> > > > > +            # invalid HTML entities ??? which is a function of Python???s
> > > > > +            # standard HTML parsing library. This will probably never be
> > > > > +            # fully compatible with the pyzor, but we can get it close.
> > > > > +
> > > > > +            # The original is:
> > > > > +            #
> > > > > +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
> > > > > +            #
> > > > > +            # The parsing loop then ???backs up??? one byte if the last
> > > > > +            # character isn???t a ???;???. We use a look-ahead assertion to
> > > > > +            # mimic that behavior.
> > > > > +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
> > > > > +
> > > > > +            # The original is:
> > > > > +            #
> > > > > +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
> > > > > +            #
> > > > > +            # We again use a look-ahead assertion to mimic Python.
> > > > > +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
> > > > > +
> > > > > +            # Python???s HTMLParser aborts its parsing loop when it encounters
> > > > > +            # an invalid numeric reference.
> > > > > +            $copy =~ s<\&\#
> > > > > +                (?:
> > > > > +                    [^0-9xX]        # anything but the expected first char
> > > > > +                    |
> > > > > +                    [0-9]+[a-fA-F]  # hex within decimal
> > > > > +                    |
> > > > > +                    [xX][^0-9a-fA-F]
> > > > > +                )
> > > > > +                (.*)
> > > > > +            ><
> > > > > +                ( -1 == index($1, ';') ) ? q<> : '&#'
> > > > > +            >exs;
> > > > > +
> > > > > +            # Python???s HTMLParser treats invalid entities as incomplete
> > > > > +            $copy =~ s<(\&\#?)><$1 >gx;
> > > > > +
> > > > > +            $copy =~ s<\A\s+><>;
> > > > > +            $copy =~ s<\s+\z><>;
> > > > > +
> > > > > +            push @pieces, \$copy if length $copy;
> > > > > +        },
> > > > > +        'text,tagname',
> > > > > +    );
> > > > > +
> > > > > +    $p->parse($html);
> > > > > +    $p->eof();
> > > > > +
> > > > > +    my $payload = join( q< >, map { $$_ } @pieces );
> > > > > +
> > > > > +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
> > > > > +    # plain spaces.
> > > > > +    $payload =~ s<[^\S\x{a0}]+>< >g;
> > > > > +
> > > > > +    return $payload;
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/t/pyzor.t b/t/pyzor.t
> > > > > index 891f38d..e4ef83f 100755
> > > > > --- a/t/pyzor.t
> > > > > +++ b/t/pyzor.t
> > > > > @@ -3,12 +3,9 @@
> > > > >   use lib '.'; use lib 't';
> > > > >   use SATest; sa_t_init("pyzor");
> > > > > -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
> > > > > -
> > > > >   use Test::More;
> > > > >   plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
> > > > > -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
> > > > > -plan tests => 8;
> > > > > +plan tests => 5;
> > > > >   diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
> > > > > @@ -30,7 +27,7 @@ tstprefs ("
> > > > >   sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > >   # Same with fork
> > > > > -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
> > > > > +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > >   #TESTING FOR HAM
> > > > > @@ -44,7 +41,3 @@ ok_all_patterns();
> > > > >   sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > > -# same with fork
> > > > > -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
> > > > > -ok_all_patterns();
> > > > > -
> > > > 
> > 
> -- 
> Kevin A. McGrail
> KMcGrail@Apache.org
> 
> Member, Apache Software Foundation
> Chair Emeritus Apache SpamAssassin Project
> https://www.linkedin.com/in/kmcgrail - 703.798.0171

Re: new Pyzor implementation

Posted by "Kevin A. McGrail" <km...@apache.org>.
No worries there that I know of.

cPanel has the paperwork for CCLA on file and several people with ICLA's 
as well.  They've given us permission to commit the code too.

I think it will be better than any dependency on external binaries.

Regards,

KAM

On 10/14/2021 10:37 AM, Henrik K wrote:
> If that's the case, I probably wouldn't have any objections.  Not sure if it
> requires some Contributor License Agreement from cPanels part (maybe they
> already have one), and I guess atleast a bug to make it official..  Sidney
> or KAM can probably chime in on the admin side..
>
>
> On Thu, Oct 14, 2021 at 04:32:53PM +0200, Giovanni Bechis wrote:
>> Once committed, code will be no more developed by cPanel on CPAN
>> and original code will be removed.
>>
>> I can work to integrate old and new Pyzor versions.
>>
>>   Giovanni
>>
>> On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
>>> If it's developed by cPanel in CPAN, then it should not be committed to SA,
>>> unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
>>> we have developer resources and will to take it aboard.
>>>
>>> As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
>>> as it makes no sense to ditch support for the widely installed original
>>> Pyzor.
>>>
>>>
>>> On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
>>>> Hi,
>>>> cPanel has developed a native Perl Pyzor implementation for SpamAssassin
>>>> and a diff against SpamAssassin 4.0 follows.
>>>> Atm I am using it in production on a small server, more tests and
>>>> opinions are welcome.
>>>>
>>>> Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
>>>>
>>>>   Cheers
>>>>    Giovanni
>>>>
>>>> diff --git a/MANIFEST b/MANIFEST
>>>> index 25d0192..2d9588c 100644
>>>> --- a/MANIFEST
>>>> +++ b/MANIFEST
>>>> @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
>>>>   lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
>>>>   lib/Mail/SpamAssassin/PluginHandler.pm
>>>>   lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
>>>> +lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>> +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>> +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>> +lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>> +lib/Mail/SpamAssassin/Pyzor.pm
>>>>   lib/Mail/SpamAssassin/RegistryBoundaries.pm
>>>>   lib/Mail/SpamAssassin/Reporter.pm
>>>>   lib/Mail/SpamAssassin/SQLBasedAddrList.pm
>>>> diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>> index 3efd4b4..e4c9c05 100644
>>>> --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>> +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
>>>> @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
>>>>   
>>>>   use Mail::SpamAssassin::Plugin;
>>>>   use Mail::SpamAssassin::Logger;
>>>> -use Mail::SpamAssassin::Timeout;
>>>> -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
>>>> -                                proc_status_ok exit_status_str);
>>>> +use Mail::SpamAssassin::Util qw(untaint_var);
>>>> +
>>>>   use strict;
>>>>   use warnings;
>>>>   # use bytes;
>>>>   use re 'taint';
>>>>   
>>>> -use Storable;
>>>> -use POSIX qw(PIPE_BUF WNOHANG _exit);
>>>> -
>>>>   our @ISA = qw(Mail::SpamAssassin::Plugin);
>>>>   
>>>>   sub new {
>>>> @@ -78,7 +74,7 @@ sub set_config {
>>>>     my ($self, $conf) = @_;
>>>>     my @cmds;
>>>>   
>>>> -=head1 USER OPTIONS
>>>> +=head1 ADMINISTRATOR OPTIONS
>>>>   
>>>>   =over 4
>>>>   
>>>> @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
>>>>     });
>>>>   
>>>> -=item pyzor_fork (0|1)		(default: 0)
>>>> -
>>>> -Instead of running Pyzor synchronously, fork separate process for it and
>>>> -read the results in later (similar to async DNS lookups).  Increases
>>>> -throughput.  Experimental.
>>>> -
>>>> -=cut
>>>> -
>>>> -  push(@cmds, {
>>>> -    setting => 'pyzor_fork',
>>>> -    is_admin => 1,
>>>> -    default => 0,
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
>>>> -  });
>>>> -
>>>> -=item pyzor_count_min NUMBER	(default: 5)
>>>> +=item pyzor_count_min NUMBER		(default: 5)
>>>>   
>>>>   This option sets how often a message's body checksum must have been
>>>>   reported to the Pyzor server before SpamAssassin will consider the Pyzor
>>>> @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>     });
>>>>   
>>>> -  # Deprecated setting, the name makes no sense!
>>>> -  push (@cmds, {
>>>> -    setting => 'pyzor_max',
>>>> -    is_admin => 1,
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
>>>> -    code => sub {
>>>> -      my ($self, $key, $value, $line) = @_;
>>>> -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
>>>> -      if ($value !~ /^\d+$/) {
>>>> -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>> -      }
>>>> -      $self->{pyzor_count_min} = $value;
>>>> -    }
>>>> -  });
>>>> -
>>>> -=item pyzor_whitelist_min NUMBER	(default: 10)
>>>> -
>>>> -This option sets how often a message's body checksum must have been
>>>> -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
>>>> -result.  Final decision is made by pyzor_whitelist_factor.
>>>> -
>>>> -=cut
>>>> -
>>>> -  push (@cmds, {
>>>> -    setting => 'pyzor_whitelist_min',
>>>> -    is_admin => 1,
>>>> -    default => 10,
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>> -  });
>>>> -
>>>> -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
>>>> -
>>>> -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
>>>> -For default setting this means: 50 reports requires 10 whitelistings.
>>>> -
>>>> -=cut
>>>> -
>>>> -  push (@cmds, {
>>>> -    setting => 'pyzor_whitelist_factor',
>>>> -    is_admin => 1,
>>>> -    default => 0.2,
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>> -  });
>>>> -
>>>>   =back
>>>>   
>>>> -=head1 ADMINISTRATOR OPTIONS
>>>> -
>>>>   =over 4
>>>>   
>>>>   =item pyzor_timeout n		(default: 5)
>>>> @@ -210,478 +145,182 @@ removing one of them.
>>>>       type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
>>>>     });
>>>>   
>>>> -=item pyzor_options options
>>>> +=item pyzor_whitelist_min NUMBER        (default: 10)
>>>>   
>>>> -Specify additional options to the pyzor(1) command. Please note that only
>>>> -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
>>>> +This option sets how often a message's body checksum must have been
>>>> +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
>>>> +result.  Final decision is made by pyzor_whitelist_factor.
>>>>   
>>>>   =cut
>>>>   
>>>>     push (@cmds, {
>>>> -    setting => 'pyzor_options',
>>>> +    setting => 'pyzor_whitelist_min',
>>>>       is_admin => 1,
>>>> -    default => '',
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
>>>> -    code => sub {
>>>> -      my ($self, $key, $value, $line) = @_;
>>>> -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
>>>> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>> -      }
>>>> -      $self->{pyzor_options} = $1;
>>>> -    }
>>>> +    default => 10,
>>>> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>     });
>>>>   
>>>> -=item pyzor_path STRING
>>>> +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
>>>>   
>>>> -This option tells SpamAssassin specifically where to find the C<pyzor>
>>>> -client instead of relying on SpamAssassin to find it in the current
>>>> -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
>>>> -you should use this, as the current PATH will have been cleared.
>>>> +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
>>>> +For default setting this means: 50 reports requires 10 whitelistings.
>>>>   
>>>>   =cut
>>>>   
>>>>     push (@cmds, {
>>>> -    setting => 'pyzor_path',
>>>> +    setting => 'pyzor_whitelist_factor',
>>>>       is_admin => 1,
>>>> -    default => undef,
>>>> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
>>>> -    code => sub {
>>>> -      my ($self, $key, $value, $line) = @_;
>>>> -      if (!defined $value || !length $value) {
>>>> -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
>>>> -      }
>>>> -      $value = untaint_file_path($value);
>>>> -      if (!-x $value) {
>>>> -	info("config: pyzor_path \"$value\" isn't an executable");
>>>> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
>>>> -      }
>>>> -
>>>> -      $self->{pyzor_path} = $value;
>>>> -    }
>>>> +    default => 0.2,
>>>> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>>>>     });
>>>>   
>>>>     $conf->{parser}->register_commands(\@cmds);
>>>>   }
>>>>   
>>>>   sub is_pyzor_available {
>>>> -  my ($self) = @_;
>>>> +    my ($self) = @_;
>>>>   
>>>> -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
>>>> -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
>>>> -
>>>> -  unless ($pyzor && -x $pyzor) {
>>>> -    dbg("pyzor: no pyzor executable found");
>>>> -    $self->{pyzor_available} = 0;
>>>> -    return 0;
>>>> -  }
>>>> -
>>>> -  # remember any found pyzor
>>>> -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
>>>> -
>>>> -  dbg("pyzor: pyzor is available: $pyzor");
>>>> -  return 1;
>>>> +    local $@;
>>>> +    eval {
>>>> +        require Mail::SpamAssassin::Pyzor::Digest;
>>>> +        require Mail::SpamAssassin::Pyzor::Client;
>>>> +    };
>>>> +    return $@ ? 0 : 1;
>>>>   }
>>>>   
>>>> -sub finish_parsing_start {
>>>> -  my ($self, $opts) = @_;
>>>> +sub get_pyzor_interface {
>>>> +  my ($self) = @_;
>>>>   
>>>> -  # If forking, hard adjust priority -100 to launch early
>>>> -  # Find rulenames from eval_to_rule mappings
>>>> -  if ($opts->{conf}->{pyzor_fork}) {
>>>> -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
>>>> -      dbg("pyzor: adjusting rule $_ priority to -100");
>>>> -      $opts->{conf}->{priority}->{$_} = -100;
>>>> -    }
>>>> +  if (!$self->{main}->{conf}->{use_pyzor}) {
>>>> +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
>>>> +    $self->{pyzor_interface} = "disabled";
>>>> +    $self->{pyzor_available} = 0;
>>>> +  }
>>>> +  elsif ($self->is_pyzor_available()) {
>>>> +    $self->{pyzor_interface} = "pyzor";
>>>> +    $self->{pyzor_available} = 1;
>>>> +  }
>>>> +  else {
>>>> +    dbg("pyzor: no pyzor found, disabling Pyzor");
>>>> +    $self->{pyzor_available} = 0;
>>>>     }
>>>>   }
>>>>   
>>>>   sub check_pyzor {
>>>> -  my ($self, $pms, $full) = @_;
>>>> -
>>>> -  return 0 if !$self->{pyzor_available};
>>>> -  return 0 if !$self->{main}->{conf}->{use_pyzor};
>>>> -
>>>> -  return 0 if $pms->{pyzor_running};
>>>> -  $pms->{pyzor_running} = 1;
>>>> -
>>>> -  return 0 if !$self->is_pyzor_available();
>>>> -
>>>> -  my $timer = $self->{main}->time_method("check_pyzor");
>>>> +  my ($self, $permsgstatus, $full) = @_;
>>>>   
>>>>     # initialize valid tags
>>>> -  $pms->{tag_data}->{PYZOR} = '';
>>>> -
>>>> -  # create fulltext tmpfile now (before possible forking)
>>>> -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
>>>> -
>>>> -  ## non-forking method
>>>> -
>>>> -  if (!$self->{main}->{conf}->{pyzor_fork}) {
>>>> -    my @results = $self->pyzor_lookup($pms);
>>>> -    return $self->_check_result($pms, \@results);
>>>> -  }
>>>> -
>>>> -  ## forking method
>>>> -
>>>> -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
>>>> -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
>>>> -
>>>> -  # create socketpair for communication
>>>> -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
>>>> -  my $back_selector = '';
>>>> -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
>>>> -  eval {
>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
>>>> -  } or do {
>>>> -    dbg("pyzor: backchannel pre-setup failed: $@");
>>>> -    delete $pms->{pyzor_backchannel};
>>>> -    return 0;
>>>> -  };
>>>> +  $permsgstatus->{tag_data}->{PYZOR} = "";
>>>>   
>>>> -  my $pid = fork();
>>>> -  if (!defined $pid) {
>>>> -    info("pyzor: child fork failed: $!");
>>>> -    delete $pms->{pyzor_backchannel};
>>>> -    return 0;
>>>> -  }
>>>> -  if (!$pid) {
>>>> -    $0 = "$0 (pyzor)";
>>>> -    $SIG{CHLD} = 'DEFAULT';
>>>> -    $SIG{PIPE} = 'IGNORE';
>>>> -    $SIG{$_} = sub {
>>>> -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
>>>> -      _exit(6);  # avoid END and destructor processing
>>>> -      kill('KILL',$$);  # still kicking? die!
>>>> -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
>>>> -    dbg("pyzor: child process $$ forked");
>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
>>>> -    my @results = $self->pyzor_lookup($pms);
>>>> -    my $backmsg;
>>>> -    eval {
>>>> -      $backmsg = Storable::freeze(\@results);
>>>> -    };
>>>> -    if ($@) {
>>>> -      dbg("pyzor: child return value freeze failed: $@");
>>>> -      _exit(0); # avoid END and destructor processing
>>>> -    }
>>>> -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
>>>> -      dbg("pyzor: child backchannel write failed: $!");
>>>> -    }
>>>> -    _exit(0); # avoid END and destructor processing
>>>> -  }
>>>> -
>>>> -  $pms->{pyzor_pid} = $pid;
>>>> +  my $timer = $self->{main}->time_method("check_pyzor");
>>>>   
>>>> -  eval {
>>>> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
>>>> -  } or do {
>>>> -    dbg("pyzor: backchannel post-setup failed: $@");
>>>> -    delete $pms->{pyzor_backchannel};
>>>> -    return 0;
>>>> -  };
>>>> +  $self->get_pyzor_interface();
>>>> +  return 0 unless $self->{pyzor_available};
>>>>   
>>>> -  return 0;
>>>> +  return $self->pyzor_lookup($permsgstatus, $full);
>>>>   }
>>>>   
>>>>   sub pyzor_lookup {
>>>> -  my ($self, $pms) = @_;
>>>> -
>>>> -  my $conf = $self->{main}->{conf};
>>>> -  my $timeout = $conf->{pyzor_timeout};
>>>> -
>>>> -  # note: not really tainted, this came from system configuration file
>>>> -  my $path = untaint_file_path($conf->{pyzor_path});
>>>> -  my $opts = untaint_var($conf->{pyzor_options}) || '';
>>>> -
>>>> -  $pms->enter_helper_run_mode();
>>>> -
>>>> -  my $pid;
>>>> -  my @resp;
>>>> -  my $timer = Mail::SpamAssassin::Timeout->new(
>>>> -           { secs => $timeout, deadline => $pms->{master_deadline} });
>>>> -  my $err = $timer->run_and_catch(sub {
>>>> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
>>>> -
>>>> -    dbg("pyzor: opening pipe: ".
>>>> -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
>>>> -
>>>> -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
>>>> -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
>>>> -    $pid or die "$!\n";
>>>> -
>>>> -    # read+split avoids a Perl I/O bug (Bug 5985)
>>>> -    my($inbuf, $nread);
>>>> -    my $resp = '';
>>>> -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
>>>> -    defined $nread  or die "error reading from pipe: $!";
>>>> -    @resp = split(/^/m, $resp, -1);
>>>> -
>>>> -    my $errno = 0;
>>>> -    close PYZOR or $errno = $!;
>>>> -    if (proc_status_ok($?, $errno)) {
>>>> -      dbg("pyzor: [%s] finished successfully", $pid);
>>>> -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
>>>> -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
>>>> -    } else {
>>>> -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
>>>> -    }
>>>> -
>>>> -  });
>>>> -
>>>> -  if (defined(fileno(*PYZOR))) {  # still open
>>>> -    if ($pid) {
>>>> -      if (kill('TERM', $pid)) {
>>>> -        dbg("pyzor: killed stale helper [$pid]");
>>>> -      } else {
>>>> -        dbg("pyzor: killing helper application [$pid] failed: $!");
>>>> -      }
>>>> -    }
>>>> -    my $errno = 0;
>>>> -    close PYZOR or $errno = $!;
>>>> -    proc_status_ok($?, $errno)
>>>> -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
>>>> -  }
>>>> -
>>>> -  $pms->leave_helper_run_mode();
>>>> -
>>>> -  if ($timer->timed_out()) {
>>>> -    dbg("pyzor: check timed out after $timeout seconds");
>>>> -    return ();
>>>> -  } elsif ($err) {
>>>> -    chomp $err;
>>>> -    info("pyzor: check failed: $err");
>>>> -    return ();
>>>> -  }
>>>> -
>>>> -  return @resp;
>>>> -}
>>>> -
>>>> -sub check_tick {
>>>> -  my ($self, $opts) = @_;
>>>> -  $self->_check_forked_result($opts->{permsgstatus}, 0);
>>>> -}
>>>> -
>>>> -sub check_cleanup {
>>>> -  my ($self, $opts) = @_;
>>>> -  $self->_check_forked_result($opts->{permsgstatus}, 1);
>>>> -}
>>>> -
>>>> -sub _check_forked_result {
>>>> -  my ($self, $pms, $finish) = @_;
>>>> -
>>>> -  return 0 if !$pms->{pyzor_backchannel};
>>>> -  return 0 if !$pms->{pyzor_pid};
>>>> +    my ( $self, $permsgstatus, $fulltext ) = @_;
>>>> +    my $conf = $self->{main}->{conf};
>>>> +    my $timeout = $conf->{pyzor_timeout};
>>>> +
>>>> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
>>>> +
>>>> +    local $@;
>>>> +    my $ref = eval { $client->check($digest); };
>>>> +    dbg("pyzor: got response: $client->{'_server_host'}");
>>>> +    # $client reply must be an hash
>>>> +    return 0 if (not (ref $ref eq ref {}));
>>>> +    if ($@) {
>>>> +        my $err = $@;
>>>>   
>>>> -  my $timer = $self->{main}->time_method("check_pyzor");
>>>> +        $err = eval { $err->get_message() } || $err;
>>>>   
>>>> -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
>>>> -
>>>> -  my $kid_pid = $pms->{pyzor_pid};
>>>> -  # if $finish, force waiting for the child
>>>> -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
>>>> -  if ($pid == 0) {
>>>> -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
>>>> -    if ($pms->{pyzor_abort}) {
>>>> -      dbg("pyzor: bailing out due to deadline/shortcircuit");
>>>> -      kill('TERM', $kid_pid);
>>>> -      if (waitpid($kid_pid, WNOHANG) == 0) {
>>>> -        sleep(1);
>>>> -        if (waitpid($kid_pid, WNOHANG) == 0) {
>>>> -          dbg("pyzor: child process $kid_pid still alive, KILL");
>>>> -          kill('KILL', $kid_pid);
>>>> -          waitpid($kid_pid, 0);
>>>> +        warn("pyzor: check failed: $err\n");
>>>> +        return 0;
>>>> +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
>>>> +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
>>>> +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
>>>> +        } else {
>>>> +          dbg("pyzor: check failed with undefined code");
>>>>           }
>>>> -      }
>>>> -      delete $pms->{pyzor_pid};
>>>> -      delete $pms->{pyzor_backchannel};
>>>> +        return 0;
>>>>       }
>>>> -    return 0;
>>>> -  } elsif ($pid == -1) {
>>>> -    # child does not exist?
>>>> -    dbg("pyzor: child process $kid_pid already handled?");
>>>> -    delete $pms->{pyzor_backchannel};
>>>> -    return 0;
>>>> -  }
>>>>   
>>>> -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
>>>> +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
>>>> +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
>>>> +    my $count_min = $conf->{pyzor_count_min};
>>>> +    my $wl_min = $conf->{pyzor_whitelist_min};
>>>>   
>>>> -  dbg("pyzor: child process $kid_pid finished, reading results");
>>>> +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
>>>> +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
>>>>   
>>>> -  my $backmsg;
>>>> -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
>>>> -  if (!defined $ret || $ret == 0) {
>>>> -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
>>>> -    delete $pms->{pyzor_backchannel};
>>>> -    return 0;
>>>> -  }
>>>> -
>>>> -  delete $pms->{pyzor_backchannel};
>>>> +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
>>>>   
>>>> -  my $results;
>>>> -  eval {
>>>> -    $results = Storable::thaw($backmsg);
>>>> -  };
>>>> -  if ($@) {
>>>> -    dbg("pyzor: child return value thaw failed: $@");
>>>> -    return;
>>>> -  }
>>>> -
>>>> -  $self->_check_result($pms, $results);
>>>> -}
>>>> +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
>>>> +      $wl_limit);
>>>>   
>>>> -sub _check_result {
>>>> -  my ($self, $pms, $results) = @_;
>>>> -
>>>> -  if (!@$results) {
>>>> -    dbg("pyzor: no response from server");
>>>> -    return 0;
>>>> -  }
>>>> -
>>>> -  my $count = 0;
>>>> -  my $count_wl = 0;
>>>> -  foreach my $res (@$results) {
>>>> -    chomp($res);
>>>> -    if ($res =~ /^Traceback/) {
>>>> -      info("pyzor: internal error, python traceback seen in response: $res");
>>>> +    # Empty body etc results in same hash, we should skip very large numbers..
>>>> +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
>>>> +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
>>>>         return 0;
>>>>       }
>>>> -    dbg("pyzor: got response: $res");
>>>> -    # this regexp is intended to be a little bit forgiving
>>>> -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
>>>> -      # until pyzor servers can sync their DBs,
>>>> -      # sum counts obtained from all servers
>>>> -      $count += untaint_var($1)+0; # crazy but needs untainting
>>>> -      $count_wl += untaint_var($2)+0;
>>>> -    } else {
>>>> -      # warn on failures to parse
>>>> -      info("pyzor: failure to parse response \"$res\"");
>>>> -    }
>>>> -  }
>>>> -
>>>> -  my $conf = $self->{main}->{conf};
>>>> -
>>>> -  my $count_min = $conf->{pyzor_count_min};
>>>> -  my $wl_min = $conf->{pyzor_whitelist_min};
>>>>   
>>>> -  my $wl_limit = $count_wl >= $wl_min ?
>>>> -    $count * $conf->{pyzor_whitelist_factor} : 0;
>>>> -
>>>> -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
>>>> -    $wl_limit);
>>>> -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
>>>> -
>>>> -  # Empty body etc results in same hash, we should skip very large numbers..
>>>> -  if ($count >= 1000000 || $count_wl >= 10000) {
>>>> -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
>>>> -    return 0;
>>>> -  }
>>>> -
>>>> -  # Whitelisted?
>>>> -  if ($wl_limit && $count_wl >= $wl_limit) {
>>>> -    dbg("pyzor: message whitelisted");
>>>> -    return 0;
>>>> -  }
>>>> +    # Whitelisted?
>>>> +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
>>>> +      dbg("pyzor: message whitelisted");
>>>> +      return 0;
>>>> +    }
>>>>   
>>>> -  if ($count >= $count_min) {
>>>> -    if ($conf->{pyzor_fork}) {
>>>> -      # forked needs to run got_hit()
>>>> -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
>>>> +    if ( $pyzor_count >= $count_min ) {
>>>> +      return 1;
>>>>       }
>>>> -    return 1;
>>>> -  }
>>>>   
>>>> -  return 0;
>>>> +    return 0;
>>>>   }
>>>>   
>>>>   sub plugin_report {
>>>>     my ($self, $options) = @_;
>>>>   
>>>> -  return if !$self->{pyzor_available};
>>>> -  return if !$self->{main}->{conf}->{use_pyzor};
>>>> -  return if $options->{report}->{options}->{dont_report_to_pyzor};
>>>> -  return if !$self->is_pyzor_available();
>>>> -
>>>> -  # use temporary file: open2() is unreliable due to buffering under spamd
>>>> -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
>>>> -  if ($self->pyzor_report($options, $tmpf)) {
>>>> -    $options->{report}->{report_available} = 1;
>>>> -    info("reporter: spam reported to Pyzor");
>>>> -    $options->{report}->{report_return} = 1;
>>>> -  }
>>>> -  else {
>>>> -    info("reporter: could not report spam to Pyzor");
>>>> -  }
>>>> -  $options->{report}->delete_fulltext_tmpfile($tmpf);
>>>> +  return unless $self->{pyzor_available};
>>>> +  return unless $self->{main}->{conf}->{use_pyzor};
>>>>   
>>>> -  return 1;
>>>> +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
>>>> +  {
>>>> +    if ($self->pyzor_report($options)) {
>>>> +      $options->{report}->{report_available} = 1;
>>>> +      info("reporter: spam reported to Pyzor");
>>>> +      $options->{report}->{report_return} = 1;
>>>> +    }
>>>> +    else {
>>>> +      info("reporter: could not report spam to Pyzor");
>>>> +    }
>>>> +  }
>>>>   }
>>>>   
>>>>   sub pyzor_report {
>>>> -  my ($self, $options, $tmpf) = @_;
>>>> -
>>>> -  # note: not really tainted, this came from system configuration file
>>>> -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
>>>> -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
>>>> +    my ( $self, $options ) = @_;
>>>>   
>>>> -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
>>>> +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
>>>>   
>>>> -  $options->{report}->enter_helper_run_mode();
>>>> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
>>>>   
>>>> -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
>>>> -  my $err = $timer->run_and_catch(sub {
>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
>>>>   
>>>> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
>>>> -
>>>> -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
>>>> -
>>>> -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
>>>> -	$tmpf, 1, $path, split(' ', $opts), "report");
>>>> -    $pid or die "$!\n";
>>>> -
>>>> -    my($inbuf,$nread,$nread_all); $nread_all = 0;
>>>> -    # response is ignored, just check its existence
>>>> -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
>>>> -    defined $nread  or die "error reading from pipe: $!";
>>>> -
>>>> -    dbg("pyzor: empty response")  if $nread_all < 1;
>>>> -
>>>> -    my $errno = 0;  close PYZOR or $errno = $!;
>>>> -    # closing a pipe also waits for the process executing on the pipe to
>>>> -    # complete, no need to explicitly call waitpid
>>>> -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
>>>> -    if (proc_status_ok($?,$errno, 0)) {
>>>> -      dbg("pyzor: [%s] reporter finished successfully", $pid);
>>>> -    } else {
>>>> -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
>>>> +    local $@;
>>>> +    my $ref = eval { $client->report($digest); };
>>>> +    if ($@) {
>>>> +        warn("pyzor: report failed: $@");
>>>> +        return 0;
>>>>       }
>>>> -
>>>> -  });
>>>> -
>>>> -  $options->{report}->leave_helper_run_mode();
>>>> -
>>>> -  if ($timer->timed_out()) {
>>>> -    dbg("reporter: pyzor report timed out after $timeout seconds");
>>>> -    return 0;
>>>> -  }
>>>> -
>>>> -  if ($err) {
>>>> -    chomp $err;
>>>> -    if ($err eq '__brokenpipe__ignore__') {
>>>> -      dbg("reporter: pyzor report failed: broken pipe");
>>>> -    } else {
>>>> -      warn("reporter: pyzor report failed: $err\n");
>>>> +    elsif ( $ref->{'Code'} ne 200 ) {
>>>> +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
>>>> +        return 0;
>>>>       }
>>>> -    return 0;
>>>> -  }
>>>>   
>>>> -  return 1;
>>>> +    return 1;
>>>>   }
>>>>   
>>>> -# Version features
>>>> -sub has_fork { 1 }
>>>> -
>>>>   1;
>>>> -
>>>> -=back
>>>> -
>>>> -=cut
>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
>>>> new file mode 100644
>>>> index 0000000..8ac27f4
>>>> --- /dev/null
>>>> +++ b/lib/Mail/SpamAssassin/Pyzor.pm
>>>> @@ -0,0 +1,56 @@
>>>> +package Mail::SpamAssassin::Pyzor;
>>>> +
>>>> +# Copyright 2018 cPanel, LLC.
>>>> +# All rights reserved.
>>>> +# http://cpanel.net
>>>> +#
>>>> +# <@LICENSE>
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at:
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +# </...@LICENSE>
>>>> +#
>>>> +
>>>> +use strict;
>>>> +use warnings;
>>>> +
>>>> +our $VERSION = '0.06_01';
>>>> +
>>>> +=encoding utf-8
>>>> +
>>>> +=head1 NAME
>>>> +
>>>> +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
>>>> +
>>>> +=head1 DESCRIPTION
>>>> +
>>>> +This distribution contains Perl implementations of parts of
>>>> +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
>>>> +It is intended for use with L<Mail::SpamAssassin> but may be useful
>>>> +in other contexts.
>>>> +
>>>> +See the following modules for information on specific tools that
>>>> +the distribution includes:
>>>> +
>>>> +=over
>>>> +
>>>> +=item * L<Mail::SpamAssassin::Pyzor::Client>
>>>> +
>>>> +=item * L<Mail::SpamAssassin::Pyzor::Digest>
>>>> +
>>>> +=back
>>>> +
>>>> +=cut
>>>> +
>>>> +1;
>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>> new file mode 100644
>>>> index 0000000..ccff868
>>>> --- /dev/null
>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
>>>> @@ -0,0 +1,415 @@
>>>> +package Mail::SpamAssassin::Pyzor::Client;
>>>> +
>>>> +# Copyright 2018 cPanel, LLC.
>>>> +# All rights reserved.
>>>> +# http://cpanel.net
>>>> +#
>>>> +# <@LICENSE>
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at:
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +# </...@LICENSE>
>>>> +#
>>>> +
>>>> +use strict;
>>>> +use warnings;
>>>> +
>>>> +=encoding utf-8
>>>> +
>>>> +=head1 NAME
>>>> +
>>>> +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
>>>> +
>>>> +=head1 SYNOPSIS
>>>> +
>>>> +    use Mail::SpamAssassin::Pyzor::Client ();
>>>> +    use Mail::SpamAssassin::Pyzor::Digest ();
>>>> +
>>>> +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
>>>> +
>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
>>>> +
>>>> +    my $check_ref = $client->check($digest);
>>>> +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
>>>> +
>>>> +    my $report_ref = $client->report($digest);
>>>> +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
>>>> +
>>>> +=head1 DESCRIPTION
>>>> +
>>>> +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
>>>> +implements the functionality needed for L<Mail::SpamAssassin>.
>>>> +
>>>> +=head1 PROTOCOL DETAILS
>>>> +
>>>> +The Pyzor protocol is not a published standard, and there appears to be
>>>> +no meaningful public documentation. What follows is enough information,
>>>> +largely gleaned through forum posts and reverse engineering, to facilitate
>>>> +effective use of this module:
>>>> +
>>>> +Pyzor is an RPC-oriented, message-based protocol. Each message
>>>> +is a simple dictionary of 7-bit ASCII keys and values. Server responses
>>>> +always include at least the following:
>>>> +
>>>> +=over
>>>> +
>>>> +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
>>>> +is an error.
>>>> +
>>>> +=item * C<Diag> - Similar to HTTP status reasons: a text description
>>>> +of the status.
>>>> +
>>>> +=back
>>>> +
>>>> +(NB: There are additional standard response headers that are useful only for
>>>> +the protocol itself and thus are not part of this module???s returns.)
>>>> +
>>>> +=head2 Reliability
>>>> +
>>>> +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
>>>> +destination. A transmission failure can happen in either the request or
>>>> +the response; in either case, a timeout error will result. Such errors
>>>> +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
>>>> +
>>>> +=cut
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +our $VERSION = '0.04';
>>>> +
>>>> +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
>>>> +our $DEFAULT_SERVER_PORT    = 24441;
>>>> +our $DEFAULT_USERNAME       = 'anonymous';
>>>> +our $DEFAULT_PASSWORD       = '';
>>>> +our $DEFAULT_OP_SPEC        = '20,3,60,3';
>>>> +our $PYZOR_PROTOCOL_VERSION = 2.1;
>>>> +our $DEFAULT_TIMEOUT        = 3.5;
>>>> +our $READ_SIZE              = 8192;
>>>> +
>>>> +use IO::Socket::INET ();
>>>> +use Digest::SHA qw(sha1 sha1_hex);
>>>> +
>>>> +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head1 CONSTRUCTOR
>>>> +
>>>> +=head2 new(%OPTS)
>>>> +
>>>> +Create a new pyzor client.
>>>> +
>>>> +=over 2
>>>> +
>>>> +=item Input
>>>> +
>>>> +%OPTS are (all optional):
>>>> +
>>>> +=over 3
>>>> +
>>>> +=item * C<server_host> - The pyzor server host to connect to (default is
>>>> +C<public.pyzor.org>)
>>>> +
>>>> +=item * C<server_port> - The pyzor server port to connect to (default is
>>>> +24441)
>>>> +
>>>> +=item * C<username> - The username to present to the pyzor server (default
>>>> +is C<anonymous>)
>>>> +
>>>> +=item * C<password> - The password to present to the pyzor server (default
>>>> +is empty)
>>>> +
>>>> +=item * C<timeout> - The maximum time, in seconds, to wait for a response
>>>> +from the pyzor server (defeault is 3.5)
>>>> +
>>>> +=back
>>>> +
>>>> +=item Output
>>>> +
>>>> +=over 3
>>>> +
>>>> +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
>>>> +
>>>> +=back
>>>> +
>>>> +=back
>>>> +
>>>> +=cut
>>>> +
>>>> +sub new {
>>>> +    my ( $class, %OPTS ) = @_;
>>>> +
>>>> +    return bless {
>>>> +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
>>>> +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
>>>> +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
>>>> +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
>>>> +        '_op_spec'     => $DEFAULT_OP_SPEC,
>>>> +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
>>>> +    }, $class;
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head1 REQUEST METHODS
>>>> +
>>>> +=head2 report($digest)
>>>> +
>>>> +Report the digest of a spam message to the pyzor server. This function
>>>> +will throw if a messaging failure or timeout happens.
>>>> +
>>>> +=over 2
>>>> +
>>>> +=item Input
>>>> +
>>>> +=over 3
>>>> +
>>>> +=item $digest C<SCALAR>
>>>> +
>>>> +The message digest to report, as given by
>>>> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
>>>> +
>>>> +=back
>>>> +
>>>> +=item Output
>>>> +
>>>> +=over 3
>>>> +
>>>> +=item C<HASHREF>
>>>> +
>>>> +Returns a hashref of the standard attributes noted above.
>>>> +
>>>> +=back
>>>> +
>>>> +=back
>>>> +
>>>> +=cut
>>>> +
>>>> +sub report {
>>>> +    my ( $self, $digest ) = @_;
>>>> +
>>>> +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
>>>> +
>>>> +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
>>>> +
>>>> +    return $self->_send_receive_msg($msg_ref);
>>>> +}
>>>> +
>>>> +=head2 check($digest)
>>>> +
>>>> +Check the digest of a message to see if
>>>> +the pyzor server has a report for it. This function
>>>> +will throw if a messaging failure or timeout happens.
>>>> +
>>>> +=over 2
>>>> +
>>>> +=item Input
>>>> +
>>>> +=over 3
>>>> +
>>>> +=item $digest C<SCALAR>
>>>> +
>>>> +The message digest to check, as given by
>>>> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
>>>> +
>>>> +=back
>>>> +
>>>> +=item Output
>>>> +
>>>> +=over 3
>>>> +
>>>> +=item C<HASHREF>
>>>> +
>>>> +Returns a hashref of the standard attributes noted above
>>>> +as well as the following:
>>>> +
>>>> +=over
>>>> +
>>>> +=item * C<Count> - The number of reports the server has received
>>>> +for the given digest.
>>>> +
>>>> +=item * C<WL-Count> - The number of whitelist requests the server has received
>>>> +for the given digest.
>>>> +
>>>> +=back
>>>> +
>>>> +=back
>>>> +
>>>> +=back
>>>> +
>>>> +=cut
>>>> +
>>>> +sub check {
>>>> +    my ( $self, $digest ) = @_;
>>>> +
>>>> +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
>>>> +}
>>>> +
>>>> +# ----------------------------------------
>>>> +
>>>> +sub _send_receive_msg {
>>>> +    my ( $self, $msg_ref ) = @_;
>>>> +
>>>> +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
>>>> +
>>>> +    $self->_sign_msg($msg_ref);
>>>> +
>>>> +    return $self->_do_send_receive(
>>>> +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
>>>> +        $thread_id,
>>>> +    );
>>>> +}
>>>> +
>>>> +sub _get_base_msg {
>>>> +    my ( $self, $op, $digest ) = @_;
>>>> +
>>>> +    die "Implementor error: op is required" if !$op;
>>>> +    die "error: digest is required"         if !$digest;
>>>> +
>>>> +    return {
>>>> +        'User'      => $self->{'_username'},
>>>> +        'PV'        => $PYZOR_PROTOCOL_VERSION,
>>>> +        'Time'      => time(),
>>>> +        'Op'        => $op,
>>>> +        'Op-Digest' => $digest,
>>>> +        'Thread'    => $self->_generate_thread_id()
>>>> +    };
>>>> +}
>>>> +
>>>> +sub _do_send_receive {
>>>> +    my ( $self, $packet, $thread_id ) = @_;
>>>> +
>>>> +    my $sock = $self->_get_connection_or_die();
>>>> +
>>>> +    $self->_send_packet( $sock, $packet );
>>>> +    my $response = $self->_receive_packet( $sock, $thread_id );
>>>> +
>>>> +    return 0 if not defined $response;
>>>> +
>>>> +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
>>>> +
>>>> +    delete $resp_hr->{'Thread'};
>>>> +
>>>> +    my $response_pv = delete $resp_hr->{'PV'};
>>>> +
>>>> +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
>>>> +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
>>>> +    }
>>>> +
>>>> +    return $resp_hr;
>>>> +}
>>>> +
>>>> +sub _receive_packet {
>>>> +    my ( $self, $sock, $thread_id ) = @_;
>>>> +
>>>> +    my $timeout = $self->{'_timeout'} * 1000;
>>>> +
>>>> +    my $end_time = time + $self->{'_timeout'};
>>>> +
>>>> +    $sock->blocking(0);
>>>> +    my $response = '';
>>>> +    my $rout     = '';
>>>> +    my $rin      = '';
>>>> +    vec( $rin, fileno($sock), 1 ) = 1;
>>>> +
>>>> +    while (1) {
>>>> +        my $time_left = $end_time - time;
>>>> +
>>>> +        if ( $time_left <= 0 ) {
>>>> +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
>>>> +          return;
>>>> +        }
>>>> +
>>>> +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
>>>> +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
>>>> +            warn "read from socket: $!";
>>>> +        }
>>>> +
>>>> +        if ( index( $response, "\n\n" ) > -1 ) {
>>>> +
>>>> +            # Reject the response unless its thread ID matches what we sent.
>>>> +            # This prevents confusion among concurrent Pyzor reqeusts.
>>>> +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
>>>> +                last;
>>>> +            }
>>>> +            else {
>>>> +                $response = '';
>>>> +            }
>>>> +        }
>>>> +
>>>> +        my $found = select( $rout = $rin, undef, undef, $time_left );
>>>> +        warn "select(): $!" if $found == -1;
>>>> +    }
>>>> +
>>>> +    return $response;
>>>> +}
>>>> +
>>>> +sub _send_packet {
>>>> +    my ( $self, $sock, $packet ) = @_;
>>>> +
>>>> +    $sock->blocking(1);
>>>> +    syswrite( $sock, $packet ) or warn "write to socket: $!";
>>>> +
>>>> +    return;
>>>> +}
>>>> +
>>>> +sub _get_connection_or_die {
>>>> +    my ($self) = @_;
>>>> +
>>>> +    # clear the socket if the PID changes
>>>> +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
>>>> +        undef $self->{'_sock_pid'};
>>>> +        undef $self->{'_sock'};
>>>> +    }
>>>> +
>>>> +    $self->{'_sock_pid'} ||= $$;
>>>> +    $self->{'_sock'}     ||= IO::Socket::INET->new(
>>>> +        'PeerHost' => $self->{'_server_host'},
>>>> +        'PeerPort' => $self->{'_server_port'},
>>>> +        'Proto'    => 'udp'
>>>> +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
>>>> +
>>>> +    return $self->{'_sock'};
>>>> +}
>>>> +
>>>> +sub _sign_msg {
>>>> +    my ( $self, $msg_ref ) = @_;
>>>> +
>>>> +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
>>>> +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
>>>> +    );
>>>> +
>>>> +    return 1;
>>>> +}
>>>> +
>>>> +sub _generate_packet_from_message {
>>>> +    my ( $self, $msg_ref ) = @_;
>>>> +
>>>> +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
>>>> +}
>>>> +
>>>> +sub _generate_thread_id {
>>>> +    my $RAND_MAX = 2**16;
>>>> +    my $val      = 0;
>>>> +    $val = int rand($RAND_MAX) while $val < 1024;
>>>> +    return $val;
>>>> +}
>>>> +
>>>> +sub _get_user_pass_hash_key {
>>>> +    my ($self) = @_;
>>>> +
>>>> +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
>>>> +}
>>>> +
>>>> +1;
>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>> new file mode 100644
>>>> index 0000000..0e8a5ae
>>>> --- /dev/null
>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
>>>> @@ -0,0 +1,103 @@
>>>> +package Mail::SpamAssassin::Pyzor::Digest;
>>>> +
>>>> +# Copyright 2018 cPanel, LLC.
>>>> +# All rights reserved.
>>>> +# http://cpanel.net
>>>> +#
>>>> +# <@LICENSE>
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at:
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +# </...@LICENSE>
>>>> +#
>>>> +
>>>> +use strict;
>>>> +use warnings;
>>>> +
>>>> +=encoding utf-8
>>>> +
>>>> +=head1 NAME
>>>> +
>>>> +Mail::SpamAssassin::Pyzor::Digest
>>>> +
>>>> +=head1 SYNOPSIS
>>>> +
>>>> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
>>>> +
>>>> +=head1 DESCRIPTION
>>>> +
>>>> +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
>>>> +
>>>> +=cut
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +use Email::MIME ();
>>>> +
>>>> +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
>>>> +use Digest::SHA qw(sha1_hex);
>>>> +
>>>> +our $VERSION = '0.03';
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head1 FUNCTIONS
>>>> +
>>>> +=head2 $hex = get( $MSG )
>>>> +
>>>> +This takes an email message in raw MIME text format (i.e., as saved in the
>>>> +standard mbox format) and returns the message???s Pyzor digest in lower-case
>>>> +hexadecimal.
>>>> +
>>>> +The output from this function should normally be identical to that of
>>>> +the C<pyzor> script???s C<digest> command. It is suitable for use in
>>>> +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub get {
>>>> +    my ($text) = @_;
>>>> +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
>>>> +}
>>>> +
>>>> +# NB: This is called from the test.
>>>> +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
>>>> +    my ($msg_text_sr) = @_;
>>>> +
>>>> +    my $parsed = Email::MIME->new($$msg_text_sr);
>>>> +
>>>> +    my @lines;
>>>> +
>>>> +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
>>>> +
>>>> +    for my $payload (@$payloads_ar) {
>>>> +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
>>>> +        for my $line (@p_lines) {
>>>> +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
>>>> +
>>>> +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
>>>> +
>>>> +            # Make sure we have an octet string.
>>>> +            utf8::encode($line) if utf8::is_utf8($line);
>>>> +
>>>> +            push @lines, $line;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
>>>> +
>>>> +    return $digest_sr;
>>>> +}
>>>> +
>>>> +1;
>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>> new file mode 100644
>>>> index 0000000..522accd
>>>> --- /dev/null
>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
>>>> @@ -0,0 +1,301 @@
>>>> +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
>>>> +
>>>> +# Copyright 2018 cPanel, LLC.
>>>> +# All rights reserved.
>>>> +# http://cpanel.net
>>>> +#
>>>> +# <@LICENSE>
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at:
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +# </...@LICENSE>
>>>> +#
>>>> +
>>>> +use strict;
>>>> +use warnings;
>>>> +
>>>> +=encoding utf-8
>>>> +
>>>> +=head1 NAME
>>>> +
>>>> +Mail::SpamAssassin::Pyzor::Digest::Pieces
>>>> +
>>>> +=head1 DESCRIPTION
>>>> +
>>>> +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
>>>> +
>>>> +It reimplements logic found in pyzor???s F<digest.py> module
>>>> +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
>>>> +
>>>> +=cut
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +use Email::MIME::ContentType ();
>>>> +use Encode                   ();
>>>> +
>>>> +our $VERSION = '0.03';
>>>> +
>>>> +# each tuple is [ offset, length ]
>>>> +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
>>>> +
>>>> +use constant {
>>>> +    _MIN_LINE_LENGTH => 8,
>>>> +
>>>> +    _ATOMIC_NUM_LINES => 4,
>>>> +};
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head1 FUNCTIONS
>>>> +
>>>> +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
>>>> +
>>>> +This imitates the corresponding object method in F<digest.py>.
>>>> +It returns a reference to an array of strings. Each string can be either
>>>> +a byte string or a character string (e.g., UTF-8 decoded).
>>>> +
>>>> +NB: RFC 2822 stipulates that message bodies should use CRLF
>>>> +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
>>>> +will thus convert any plain CRs in a quoted-printable message
>>>> +body into CRLF. Python, though, doesn???t do this, so the output of
>>>> +our implementation of C<digest_payloads()> diverges from that of the Python
>>>> +original. It doesn???t ultimately make a difference since the line-ending
>>>> +whitespace gets trimmed regardless, but it???s necessary to factor in when
>>>> +comparing the output of our implementation with the Python output.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub digest_payloads {
>>>> +    my ($parsed) = @_;
>>>> +
>>>> +    my @subparts = $parsed->subparts();
>>>> +
>>>> +    my @payloads;
>>>> +
>>>> +    if (@subparts) {
>>>> +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
>>>> +    }
>>>> +    else {
>>>> +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
>>>> +
>>>> +        my $payload;
>>>> +
>>>> +        if ( $main_type eq 'text' ) {
>>>> +
>>>> +            # Decode transfer encoding, but leave us as a byte string.
>>>> +            # Note that this is where Email::MIME converts plain LF to CRLF.
>>>> +            $payload = $parsed->body();
>>>> +
>>>> +            # This does the actual character decoding (i.e., ???charset???).
>>>> +            $payload = Encode::decode( $encoding, $payload, $encode_check );
>>>> +
>>>> +            if ( $subtype eq 'html' ) {
>>>> +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
>>>> +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
>>>> +            }
>>>> +        }
>>>> +        else {
>>>> +
>>>> +            # This does no decoding, even of, e.g., quoted-printable or base64.
>>>> +            $payload = $parsed->body_raw();
>>>> +        }
>>>> +
>>>> +        push @payloads, $payload;
>>>> +    }
>>>> +
>>>> +    return \@payloads;
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head2 normalize( $STRING )
>>>> +
>>>> +This imitates the corresponding object method in F<digest.py>.
>>>> +It modifies C<$STRING> in-place.
>>>> +
>>>> +As with the original implementation, if C<$STRING> contains (decoded)
>>>> +Unicode characters, those characters will be parsed accordingly. So:
>>>> +
>>>> +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
>>>> +
>>>> +    normalize($str);
>>>> +
>>>> +The above will leave C<$str> alone, but this:
>>>> +
>>>> +    utf8::decode($str);
>>>> +
>>>> +    normalize($str);
>>>> +
>>>> +??? will trim off the last two bytes from C<$str>.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
>>>> +
>>>> +    # NULs are bad, mm-kay?
>>>> +    $_[0] =~ tr<\0><>d;
>>>> +
>>>> +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
>>>> +    # with the /a modifier.
>>>> +    #
>>>> +    # https://docs.python.org/2/library/re.html
>>>> +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
>>>> +
>>>> +    # Python: re.compile(r'\S{10,}')
>>>> +    $_[0] =~ s<\S{10,}><>ag;
>>>> +
>>>> +    # Python: re.compile(r'\S+@\S+')
>>>> +    $_[0] =~ s<\S+ @ \S+><>agx;
>>>> +
>>>> +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
>>>> +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
>>>> +
>>>> +    # (from digest.py ???)
>>>> +    # Make sure we do the whitespace last because some of the previous
>>>> +    # patterns rely on whitespace.
>>>> +    $_[0] =~ tr< \x09-\x0d><>d;
>>>> +
>>>> +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
>>>> +    # strip, then calls strip() on the string, which *will* strip Unicode
>>>> +    # whitespace from the ends.
>>>> +    $_[0] =~ s<\A\s+><>;
>>>> +    $_[0] =~ s<\s+\z><>;
>>>> +
>>>> +    return;
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head2 $yn = should_handle_line( $STRING )
>>>> +
>>>> +This imitates the corresponding object method in F<digest.py>.
>>>> +It returns a boolean.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub should_handle_line {
>>>> +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head2 $sr = assemble_lines( \@LINES )
>>>> +
>>>> +This assembles a string buffer out of @LINES. The string is the buffer
>>>> +of octets that will be hashed to produce the message digest.
>>>> +
>>>> +Each member of @LINES is expected to be an B<octet string>, not a
>>>> +character string.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub assemble_lines {
>>>> +    my ($lines_ar) = @_;
>>>> +
>>>> +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
>>>> +
>>>> +        # cf. handle_atomic() in digest.py
>>>> +        return \join( q<>, @$lines_ar );
>>>> +    }
>>>> +
>>>> +    #----------------------------------------------------------------------
>>>> +    # cf. handle_atomic() in digest.py
>>>> +
>>>> +    my $str = q<>;
>>>> +
>>>> +    for my $ofs_len ( _HASH_SPEC() ) {
>>>> +        my ( $offset, $length ) = @$ofs_len;
>>>> +
>>>> +        for my $i ( 0 .. ( $length - 1 ) ) {
>>>> +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
>>>> +
>>>> +            next if !defined $lines_ar->[$idx];
>>>> +
>>>> +            $str .= $lines_ar->[$idx];
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return \$str;
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
>>>> +
>>>> +=cut
>>>> +
>>>> +use constant _QUOTED_PRINTABLE_NAMES => (
>>>> +    "quopri-codec",
>>>> +    "quopri",
>>>> +    "quoted-printable",
>>>> +    "quotedprintable",
>>>> +);
>>>> +
>>>> +# Make Encode::decode() ignore anything that doesn???t fit the
>>>> +# given encoding.
>>>> +use constant _encode_check_ignore => q<>;
>>>> +
>>>> +sub parse_content_type {
>>>> +    my ($content_type) = @_;
>>>> +
>>>> +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
>>>> +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
>>>> +        $content_type,
>>>> +    );
>>>> +
>>>> +    my $main = $ct_parse->{'type'}    || q<>;
>>>> +    my $sub  = $ct_parse->{'subtype'} || q<>;
>>>> +
>>>> +    my $encoding = $ct_parse->{'attributes'}{'charset'};
>>>> +
>>>> +    my $checkval;
>>>> +
>>>> +    if ($encoding) {
>>>> +
>>>> +        # Lower-case everything, convert underscore to dash, and remove NUL.
>>>> +        $encoding =~ tr<A-Z_\0><a-z->d;
>>>> +
>>>> +        # Apparently pyzor accommodates messages that put the transfer
>>>> +        # encoding in the Content-Type.
>>>> +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
>>>> +            $checkval = Encode::FB_CROAK();
>>>> +        }
>>>> +    }
>>>> +    else {
>>>> +        $encoding = 'ascii';
>>>> +    }
>>>> +
>>>> +    # Match Python .decode()???s 'ignore' behavior
>>>> +    $checkval ||= \&_encode_check_ignore;
>>>> +
>>>> +    return ( $main, $sub, $encoding, $checkval );
>>>> +}
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head2 @lines = splitlines( $TEXT )
>>>> +
>>>> +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
>>>> +
>>>> +Returns a plain list in list context. Returns the number of
>>>> +items to be returned in scalar context.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub splitlines {
>>>> +    return split m<\r\n?|\n>, $_[0];
>>>> +}
>>>> +
>>>> +1;
>>>> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>> new file mode 100644
>>>> index 0000000..2617b4a
>>>> --- /dev/null
>>>> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
>>>> @@ -0,0 +1,177 @@
>>>> +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
>>>> +
>>>> +# Copyright 2018 cPanel, LLC.
>>>> +# All rights reserved.
>>>> +# http://cpanel.net
>>>> +#
>>>> +# <@LICENSE>
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to you under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at:
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +# </...@LICENSE>
>>>> +#
>>>> +
>>>> +use strict;
>>>> +use warnings;
>>>> +
>>>> +=encoding utf-8
>>>> +
>>>> +=head1 NAME
>>>> +
>>>> +Mail::SpamAssassin::Pyzor::Digest::StripHtml
>>>> +
>>>> +=head1 SYNOPSIS
>>>> +
>>>> +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
>>>> +
>>>> +=head1 DESCRIPTION
>>>> +
>>>> +This module attempts to duplicate pyzor???s HTML-stripping logic.
>>>> +
>>>> +=head1 ACCURACY
>>>> +
>>>> +This library cannot achieve 100%, bug-for-bug parity with pyzor
>>>> +because to do so would require duplicating Python???s own HTML parsing
>>>> +library. Since that library???s output has changed over time, and those
>>>> +changes in turn affect pyzor, it???s literally impossible to arrive at
>>>> +a single, fully-compatible reimplementation.
>>>> +
>>>> +That said, all known divergences between pyzor and this library involve
>>>> +invalid HTML as input.
>>>> +
>>>> +Please open bug reports for any divergences you identify, particularly
>>>> +if the input is valid HTML.
>>>> +
>>>> +=cut
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +use HTML::Parser ();
>>>> +
>>>> +our $VERSION = '0.03';
>>>> +
>>>> +#----------------------------------------------------------------------
>>>> +
>>>> +=head1 FUNCTIONS
>>>> +
>>>> +=head2 $stripped = strip( $HTML )
>>>> +
>>>> +Give it some HTML, and it???ll give back the stripped text.
>>>> +
>>>> +In B<general>, the stripping consists of removing tags as well as
>>>> +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
>>>> +removes HTML entities.
>>>> +
>>>> +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
>>>> +
>>>> +=cut
>>>> +
>>>> +sub strip {
>>>> +    my ($html) = @_;
>>>> +
>>>> +    $html =~ s<\A\s+><>;
>>>> +    $html =~ s<\s+\z><>;
>>>> +
>>>> +    my $p = HTML::Parser->new( api_version => 3 );
>>>> +
>>>> +    my @pieces;
>>>> +
>>>> +    my $accumulate = 1;
>>>> +
>>>> +    $p->handler(
>>>> +        start => sub {
>>>> +            my ($tagname) = @_;
>>>> +
>>>> +            $accumulate = 0 if $tagname eq 'script';
>>>> +            $accumulate = 0 if $tagname eq 'style';
>>>> +
>>>> +            return;
>>>> +        },
>>>> +        'tagname',
>>>> +    );
>>>> +
>>>> +    $p->handler(
>>>> +        end => sub {
>>>> +            $accumulate = 1;
>>>> +            return;
>>>> +        }
>>>> +    );
>>>> +
>>>> +    $p->handler(
>>>> +        text => sub {
>>>> +            my ($copy) = @_;
>>>> +
>>>> +            return if !$accumulate;
>>>> +
>>>> +            # pyzor???s HTML parser discards HTML entities. On top of that,
>>>> +            # we need to match, as closely as possible, pyzor???s handling of
>>>> +            # invalid HTML entities ??? which is a function of Python???s
>>>> +            # standard HTML parsing library. This will probably never be
>>>> +            # fully compatible with the pyzor, but we can get it close.
>>>> +
>>>> +            # The original is:
>>>> +            #
>>>> +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
>>>> +            #
>>>> +            # The parsing loop then ???backs up??? one byte if the last
>>>> +            # character isn???t a ???;???. We use a look-ahead assertion to
>>>> +            # mimic that behavior.
>>>> +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
>>>> +
>>>> +            # The original is:
>>>> +            #
>>>> +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
>>>> +            #
>>>> +            # We again use a look-ahead assertion to mimic Python.
>>>> +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
>>>> +
>>>> +            # Python???s HTMLParser aborts its parsing loop when it encounters
>>>> +            # an invalid numeric reference.
>>>> +            $copy =~ s<\&\#
>>>> +                (?:
>>>> +                    [^0-9xX]        # anything but the expected first char
>>>> +                    |
>>>> +                    [0-9]+[a-fA-F]  # hex within decimal
>>>> +                    |
>>>> +                    [xX][^0-9a-fA-F]
>>>> +                )
>>>> +                (.*)
>>>> +            ><
>>>> +                ( -1 == index($1, ';') ) ? q<> : '&#'
>>>> +            >exs;
>>>> +
>>>> +            # Python???s HTMLParser treats invalid entities as incomplete
>>>> +            $copy =~ s<(\&\#?)><$1 >gx;
>>>> +
>>>> +            $copy =~ s<\A\s+><>;
>>>> +            $copy =~ s<\s+\z><>;
>>>> +
>>>> +            push @pieces, \$copy if length $copy;
>>>> +        },
>>>> +        'text,tagname',
>>>> +    );
>>>> +
>>>> +    $p->parse($html);
>>>> +    $p->eof();
>>>> +
>>>> +    my $payload = join( q< >, map { $$_ } @pieces );
>>>> +
>>>> +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
>>>> +    # plain spaces.
>>>> +    $payload =~ s<[^\S\x{a0}]+>< >g;
>>>> +
>>>> +    return $payload;
>>>> +}
>>>> +
>>>> +1;
>>>> diff --git a/t/pyzor.t b/t/pyzor.t
>>>> index 891f38d..e4ef83f 100755
>>>> --- a/t/pyzor.t
>>>> +++ b/t/pyzor.t
>>>> @@ -3,12 +3,9 @@
>>>>   use lib '.'; use lib 't';
>>>>   use SATest; sa_t_init("pyzor");
>>>>   
>>>> -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
>>>> -
>>>>   use Test::More;
>>>>   plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
>>>> -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
>>>> -plan tests => 8;
>>>> +plan tests => 5;
>>>>   
>>>>   diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
>>>>   
>>>> @@ -30,7 +27,7 @@ tstprefs ("
>>>>   sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>>>>   ok_all_patterns();
>>>>   # Same with fork
>>>> -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
>>>> +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>>>>   ok_all_patterns();
>>>>   
>>>>   #TESTING FOR HAM
>>>> @@ -44,7 +41,3 @@ ok_all_patterns();
>>>>   
>>>>   sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
>>>>   ok_all_patterns();
>>>> -# same with fork
>>>> -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
>>>> -ok_all_patterns();
>>>> -
>>>
>
-- 
Kevin A. McGrail
KMcGrail@Apache.org

Member, Apache Software Foundation
Chair Emeritus Apache SpamAssassin Project
https://www.linkedin.com/in/kmcgrail - 703.798.0171


Re: new Pyzor implementation

Posted by Henrik K <he...@hege.li>.
If that's the case, I probably wouldn't have any objections.  Not sure if it
requires some Contributor License Agreement from cPanels part (maybe they
already have one), and I guess atleast a bug to make it official..  Sidney
or KAM can probably chime in on the admin side..


On Thu, Oct 14, 2021 at 04:32:53PM +0200, Giovanni Bechis wrote:
> Once committed, code will be no more developed by cPanel on CPAN
> and original code will be removed.
> 
> I can work to integrate old and new Pyzor versions.
> 
>  Giovanni
> 
> On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
> > 
> > If it's developed by cPanel in CPAN, then it should not be committed to SA,
> > unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
> > we have developer resources and will to take it aboard.
> > 
> > As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
> > as it makes no sense to ditch support for the widely installed original
> > Pyzor.
> > 
> > 
> > On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
> > > Hi,
> > > cPanel has developed a native Perl Pyzor implementation for SpamAssassin
> > > and a diff against SpamAssassin 4.0 follows.
> > > Atm I am using it in production on a small server, more tests and
> > > opinions are welcome.
> > > 
> > > Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
> > > 
> > >  Cheers
> > >   Giovanni
> > > 
> > > diff --git a/MANIFEST b/MANIFEST
> > > index 25d0192..2d9588c 100644
> > > --- a/MANIFEST
> > > +++ b/MANIFEST
> > > @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
> > >  lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
> > >  lib/Mail/SpamAssassin/PluginHandler.pm
> > >  lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
> > > +lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > +lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > +lib/Mail/SpamAssassin/Pyzor.pm
> > >  lib/Mail/SpamAssassin/RegistryBoundaries.pm
> > >  lib/Mail/SpamAssassin/Reporter.pm
> > >  lib/Mail/SpamAssassin/SQLBasedAddrList.pm
> > > diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > index 3efd4b4..e4c9c05 100644
> > > --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
> > >  
> > >  use Mail::SpamAssassin::Plugin;
> > >  use Mail::SpamAssassin::Logger;
> > > -use Mail::SpamAssassin::Timeout;
> > > -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
> > > -                                proc_status_ok exit_status_str);
> > > +use Mail::SpamAssassin::Util qw(untaint_var);
> > > +
> > >  use strict;
> > >  use warnings;
> > >  # use bytes;
> > >  use re 'taint';
> > >  
> > > -use Storable;
> > > -use POSIX qw(PIPE_BUF WNOHANG _exit);
> > > -
> > >  our @ISA = qw(Mail::SpamAssassin::Plugin);
> > >  
> > >  sub new {
> > > @@ -78,7 +74,7 @@ sub set_config {
> > >    my ($self, $conf) = @_;
> > >    my @cmds;
> > >  
> > > -=head1 USER OPTIONS
> > > +=head1 ADMINISTRATOR OPTIONS
> > >  
> > >  =over 4
> > >  
> > > @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
> > >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
> > >    });
> > >  
> > > -=item pyzor_fork (0|1)		(default: 0)
> > > -
> > > -Instead of running Pyzor synchronously, fork separate process for it and
> > > -read the results in later (similar to async DNS lookups).  Increases
> > > -throughput.  Experimental.
> > > -
> > > -=cut
> > > -
> > > -  push(@cmds, {
> > > -    setting => 'pyzor_fork',
> > > -    is_admin => 1,
> > > -    default => 0,
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > -  });
> > > -
> > > -=item pyzor_count_min NUMBER	(default: 5)
> > > +=item pyzor_count_min NUMBER		(default: 5)
> > >  
> > >  This option sets how often a message's body checksum must have been
> > >  reported to the Pyzor server before SpamAssassin will consider the Pyzor
> > > @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
> > >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > >    });
> > >  
> > > -  # Deprecated setting, the name makes no sense!
> > > -  push (@cmds, {
> > > -    setting => 'pyzor_max',
> > > -    is_admin => 1,
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > -    code => sub {
> > > -      my ($self, $key, $value, $line) = @_;
> > > -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
> > > -      if ($value !~ /^\d+$/) {
> > > -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > -      }
> > > -      $self->{pyzor_count_min} = $value;
> > > -    }
> > > -  });
> > > -
> > > -=item pyzor_whitelist_min NUMBER	(default: 10)
> > > -
> > > -This option sets how often a message's body checksum must have been
> > > -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > > -result.  Final decision is made by pyzor_whitelist_factor.
> > > -
> > > -=cut
> > > -
> > > -  push (@cmds, {
> > > -    setting => 'pyzor_whitelist_min',
> > > -    is_admin => 1,
> > > -    default => 10,
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > -  });
> > > -
> > > -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
> > > -
> > > -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > -For default setting this means: 50 reports requires 10 whitelistings.
> > > -
> > > -=cut
> > > -
> > > -  push (@cmds, {
> > > -    setting => 'pyzor_whitelist_factor',
> > > -    is_admin => 1,
> > > -    default => 0.2,
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > -  });
> > > -
> > >  =back
> > >  
> > > -=head1 ADMINISTRATOR OPTIONS
> > > -
> > >  =over 4
> > >  
> > >  =item pyzor_timeout n		(default: 5)
> > > @@ -210,478 +145,182 @@ removing one of them.
> > >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
> > >    });
> > >  
> > > -=item pyzor_options options
> > > +=item pyzor_whitelist_min NUMBER        (default: 10)
> > >  
> > > -Specify additional options to the pyzor(1) command. Please note that only
> > > -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
> > > +This option sets how often a message's body checksum must have been
> > > +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > > +result.  Final decision is made by pyzor_whitelist_factor.
> > >  
> > >  =cut
> > >  
> > >    push (@cmds, {
> > > -    setting => 'pyzor_options',
> > > +    setting => 'pyzor_whitelist_min',
> > >      is_admin => 1,
> > > -    default => '',
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > -    code => sub {
> > > -      my ($self, $key, $value, $line) = @_;
> > > -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
> > > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > -      }
> > > -      $self->{pyzor_options} = $1;
> > > -    }
> > > +    default => 10,
> > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > >    });
> > >  
> > > -=item pyzor_path STRING
> > > +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
> > >  
> > > -This option tells SpamAssassin specifically where to find the C<pyzor>
> > > -client instead of relying on SpamAssassin to find it in the current
> > > -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
> > > -you should use this, as the current PATH will have been cleared.
> > > +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > +For default setting this means: 50 reports requires 10 whitelistings.
> > >  
> > >  =cut
> > >  
> > >    push (@cmds, {
> > > -    setting => 'pyzor_path',
> > > +    setting => 'pyzor_whitelist_factor',
> > >      is_admin => 1,
> > > -    default => undef,
> > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > -    code => sub {
> > > -      my ($self, $key, $value, $line) = @_;
> > > -      if (!defined $value || !length $value) {
> > > -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> > > -      }
> > > -      $value = untaint_file_path($value);
> > > -      if (!-x $value) {
> > > -	info("config: pyzor_path \"$value\" isn't an executable");
> > > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > -      }
> > > -
> > > -      $self->{pyzor_path} = $value;
> > > -    }
> > > +    default => 0.2,
> > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > >    });
> > >  
> > >    $conf->{parser}->register_commands(\@cmds);
> > >  }
> > >  
> > >  sub is_pyzor_available {
> > > -  my ($self) = @_;
> > > +    my ($self) = @_;
> > >  
> > > -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
> > > -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
> > > -
> > > -  unless ($pyzor && -x $pyzor) {
> > > -    dbg("pyzor: no pyzor executable found");
> > > -    $self->{pyzor_available} = 0;
> > > -    return 0;
> > > -  }
> > > -
> > > -  # remember any found pyzor
> > > -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
> > > -
> > > -  dbg("pyzor: pyzor is available: $pyzor");
> > > -  return 1;
> > > +    local $@;
> > > +    eval {
> > > +        require Mail::SpamAssassin::Pyzor::Digest;
> > > +        require Mail::SpamAssassin::Pyzor::Client;
> > > +    };
> > > +    return $@ ? 0 : 1;
> > >  }
> > >  
> > > -sub finish_parsing_start {
> > > -  my ($self, $opts) = @_;
> > > +sub get_pyzor_interface {
> > > +  my ($self) = @_;
> > >  
> > > -  # If forking, hard adjust priority -100 to launch early
> > > -  # Find rulenames from eval_to_rule mappings
> > > -  if ($opts->{conf}->{pyzor_fork}) {
> > > -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
> > > -      dbg("pyzor: adjusting rule $_ priority to -100");
> > > -      $opts->{conf}->{priority}->{$_} = -100;
> > > -    }
> > > +  if (!$self->{main}->{conf}->{use_pyzor}) {
> > > +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
> > > +    $self->{pyzor_interface} = "disabled";
> > > +    $self->{pyzor_available} = 0;
> > > +  }
> > > +  elsif ($self->is_pyzor_available()) {
> > > +    $self->{pyzor_interface} = "pyzor";
> > > +    $self->{pyzor_available} = 1;
> > > +  }
> > > +  else {
> > > +    dbg("pyzor: no pyzor found, disabling Pyzor");
> > > +    $self->{pyzor_available} = 0;
> > >    }
> > >  }
> > >  
> > >  sub check_pyzor {
> > > -  my ($self, $pms, $full) = @_;
> > > -
> > > -  return 0 if !$self->{pyzor_available};
> > > -  return 0 if !$self->{main}->{conf}->{use_pyzor};
> > > -
> > > -  return 0 if $pms->{pyzor_running};
> > > -  $pms->{pyzor_running} = 1;
> > > -
> > > -  return 0 if !$self->is_pyzor_available();
> > > -
> > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > +  my ($self, $permsgstatus, $full) = @_;
> > >  
> > >    # initialize valid tags
> > > -  $pms->{tag_data}->{PYZOR} = '';
> > > -
> > > -  # create fulltext tmpfile now (before possible forking)
> > > -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
> > > -
> > > -  ## non-forking method
> > > -
> > > -  if (!$self->{main}->{conf}->{pyzor_fork}) {
> > > -    my @results = $self->pyzor_lookup($pms);
> > > -    return $self->_check_result($pms, \@results);
> > > -  }
> > > -
> > > -  ## forking method
> > > -
> > > -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
> > > -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
> > > -
> > > -  # create socketpair for communication
> > > -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
> > > -  my $back_selector = '';
> > > -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
> > > -  eval {
> > > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
> > > -  } or do {
> > > -    dbg("pyzor: backchannel pre-setup failed: $@");
> > > -    delete $pms->{pyzor_backchannel};
> > > -    return 0;
> > > -  };
> > > +  $permsgstatus->{tag_data}->{PYZOR} = "";
> > >  
> > > -  my $pid = fork();
> > > -  if (!defined $pid) {
> > > -    info("pyzor: child fork failed: $!");
> > > -    delete $pms->{pyzor_backchannel};
> > > -    return 0;
> > > -  }
> > > -  if (!$pid) {
> > > -    $0 = "$0 (pyzor)";
> > > -    $SIG{CHLD} = 'DEFAULT';
> > > -    $SIG{PIPE} = 'IGNORE';
> > > -    $SIG{$_} = sub {
> > > -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
> > > -      _exit(6);  # avoid END and destructor processing
> > > -      kill('KILL',$$);  # still kicking? die!
> > > -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
> > > -    dbg("pyzor: child process $$ forked");
> > > -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
> > > -    my @results = $self->pyzor_lookup($pms);
> > > -    my $backmsg;
> > > -    eval {
> > > -      $backmsg = Storable::freeze(\@results);
> > > -    };
> > > -    if ($@) {
> > > -      dbg("pyzor: child return value freeze failed: $@");
> > > -      _exit(0); # avoid END and destructor processing
> > > -    }
> > > -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
> > > -      dbg("pyzor: child backchannel write failed: $!");
> > > -    }
> > > -    _exit(0); # avoid END and destructor processing
> > > -  }
> > > -
> > > -  $pms->{pyzor_pid} = $pid;
> > > +  my $timer = $self->{main}->time_method("check_pyzor");
> > >  
> > > -  eval {
> > > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
> > > -  } or do {
> > > -    dbg("pyzor: backchannel post-setup failed: $@");
> > > -    delete $pms->{pyzor_backchannel};
> > > -    return 0;
> > > -  };
> > > +  $self->get_pyzor_interface();
> > > +  return 0 unless $self->{pyzor_available};
> > >  
> > > -  return 0;
> > > +  return $self->pyzor_lookup($permsgstatus, $full);
> > >  }
> > >  
> > >  sub pyzor_lookup {
> > > -  my ($self, $pms) = @_;
> > > -
> > > -  my $conf = $self->{main}->{conf};
> > > -  my $timeout = $conf->{pyzor_timeout};
> > > -
> > > -  # note: not really tainted, this came from system configuration file
> > > -  my $path = untaint_file_path($conf->{pyzor_path});
> > > -  my $opts = untaint_var($conf->{pyzor_options}) || '';
> > > -
> > > -  $pms->enter_helper_run_mode();
> > > -
> > > -  my $pid;
> > > -  my @resp;
> > > -  my $timer = Mail::SpamAssassin::Timeout->new(
> > > -           { secs => $timeout, deadline => $pms->{master_deadline} });
> > > -  my $err = $timer->run_and_catch(sub {
> > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > -
> > > -    dbg("pyzor: opening pipe: ".
> > > -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
> > > -
> > > -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
> > > -    $pid or die "$!\n";
> > > -
> > > -    # read+split avoids a Perl I/O bug (Bug 5985)
> > > -    my($inbuf, $nread);
> > > -    my $resp = '';
> > > -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
> > > -    defined $nread  or die "error reading from pipe: $!";
> > > -    @resp = split(/^/m, $resp, -1);
> > > -
> > > -    my $errno = 0;
> > > -    close PYZOR or $errno = $!;
> > > -    if (proc_status_ok($?, $errno)) {
> > > -      dbg("pyzor: [%s] finished successfully", $pid);
> > > -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
> > > -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
> > > -    } else {
> > > -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > > -    }
> > > -
> > > -  });
> > > -
> > > -  if (defined(fileno(*PYZOR))) {  # still open
> > > -    if ($pid) {
> > > -      if (kill('TERM', $pid)) {
> > > -        dbg("pyzor: killed stale helper [$pid]");
> > > -      } else {
> > > -        dbg("pyzor: killing helper application [$pid] failed: $!");
> > > -      }
> > > -    }
> > > -    my $errno = 0;
> > > -    close PYZOR or $errno = $!;
> > > -    proc_status_ok($?, $errno)
> > > -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > > -  }
> > > -
> > > -  $pms->leave_helper_run_mode();
> > > -
> > > -  if ($timer->timed_out()) {
> > > -    dbg("pyzor: check timed out after $timeout seconds");
> > > -    return ();
> > > -  } elsif ($err) {
> > > -    chomp $err;
> > > -    info("pyzor: check failed: $err");
> > > -    return ();
> > > -  }
> > > -
> > > -  return @resp;
> > > -}
> > > -
> > > -sub check_tick {
> > > -  my ($self, $opts) = @_;
> > > -  $self->_check_forked_result($opts->{permsgstatus}, 0);
> > > -}
> > > -
> > > -sub check_cleanup {
> > > -  my ($self, $opts) = @_;
> > > -  $self->_check_forked_result($opts->{permsgstatus}, 1);
> > > -}
> > > -
> > > -sub _check_forked_result {
> > > -  my ($self, $pms, $finish) = @_;
> > > -
> > > -  return 0 if !$pms->{pyzor_backchannel};
> > > -  return 0 if !$pms->{pyzor_pid};
> > > +    my ( $self, $permsgstatus, $fulltext ) = @_;
> > > +    my $conf = $self->{main}->{conf};
> > > +    my $timeout = $conf->{pyzor_timeout};
> > > +
> > > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
> > > +
> > > +    local $@;
> > > +    my $ref = eval { $client->check($digest); };
> > > +    dbg("pyzor: got response: $client->{'_server_host'}");
> > > +    # $client reply must be an hash
> > > +    return 0 if (not (ref $ref eq ref {}));
> > > +    if ($@) {
> > > +        my $err = $@;
> > >  
> > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > +        $err = eval { $err->get_message() } || $err;
> > >  
> > > -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
> > > -
> > > -  my $kid_pid = $pms->{pyzor_pid};
> > > -  # if $finish, force waiting for the child
> > > -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
> > > -  if ($pid == 0) {
> > > -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
> > > -    if ($pms->{pyzor_abort}) {
> > > -      dbg("pyzor: bailing out due to deadline/shortcircuit");
> > > -      kill('TERM', $kid_pid);
> > > -      if (waitpid($kid_pid, WNOHANG) == 0) {
> > > -        sleep(1);
> > > -        if (waitpid($kid_pid, WNOHANG) == 0) {
> > > -          dbg("pyzor: child process $kid_pid still alive, KILL");
> > > -          kill('KILL', $kid_pid);
> > > -          waitpid($kid_pid, 0);
> > > +        warn("pyzor: check failed: $err\n");
> > > +        return 0;
> > > +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
> > > +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
> > > +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > > +        } else {
> > > +          dbg("pyzor: check failed with undefined code");
> > >          }
> > > -      }
> > > -      delete $pms->{pyzor_pid};
> > > -      delete $pms->{pyzor_backchannel};
> > > +        return 0;
> > >      }
> > > -    return 0;
> > > -  } elsif ($pid == -1) {
> > > -    # child does not exist?
> > > -    dbg("pyzor: child process $kid_pid already handled?");
> > > -    delete $pms->{pyzor_backchannel};
> > > -    return 0;
> > > -  }
> > >  
> > > -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
> > > +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
> > > +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
> > > +    my $count_min = $conf->{pyzor_count_min};
> > > +    my $wl_min = $conf->{pyzor_whitelist_min};
> > >  
> > > -  dbg("pyzor: child process $kid_pid finished, reading results");
> > > +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
> > > +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
> > >  
> > > -  my $backmsg;
> > > -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
> > > -  if (!defined $ret || $ret == 0) {
> > > -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
> > > -    delete $pms->{pyzor_backchannel};
> > > -    return 0;
> > > -  }
> > > -
> > > -  delete $pms->{pyzor_backchannel};
> > > +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
> > >  
> > > -  my $results;
> > > -  eval {
> > > -    $results = Storable::thaw($backmsg);
> > > -  };
> > > -  if ($@) {
> > > -    dbg("pyzor: child return value thaw failed: $@");
> > > -    return;
> > > -  }
> > > -
> > > -  $self->_check_result($pms, $results);
> > > -}
> > > +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
> > > +      $wl_limit);
> > >  
> > > -sub _check_result {
> > > -  my ($self, $pms, $results) = @_;
> > > -
> > > -  if (!@$results) {
> > > -    dbg("pyzor: no response from server");
> > > -    return 0;
> > > -  }
> > > -
> > > -  my $count = 0;
> > > -  my $count_wl = 0;
> > > -  foreach my $res (@$results) {
> > > -    chomp($res);
> > > -    if ($res =~ /^Traceback/) {
> > > -      info("pyzor: internal error, python traceback seen in response: $res");
> > > +    # Empty body etc results in same hash, we should skip very large numbers..
> > > +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
> > > +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> > >        return 0;
> > >      }
> > > -    dbg("pyzor: got response: $res");
> > > -    # this regexp is intended to be a little bit forgiving
> > > -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
> > > -      # until pyzor servers can sync their DBs,
> > > -      # sum counts obtained from all servers
> > > -      $count += untaint_var($1)+0; # crazy but needs untainting
> > > -      $count_wl += untaint_var($2)+0;
> > > -    } else {
> > > -      # warn on failures to parse
> > > -      info("pyzor: failure to parse response \"$res\"");
> > > -    }
> > > -  }
> > > -
> > > -  my $conf = $self->{main}->{conf};
> > > -
> > > -  my $count_min = $conf->{pyzor_count_min};
> > > -  my $wl_min = $conf->{pyzor_whitelist_min};
> > >  
> > > -  my $wl_limit = $count_wl >= $wl_min ?
> > > -    $count * $conf->{pyzor_whitelist_factor} : 0;
> > > -
> > > -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
> > > -    $wl_limit);
> > > -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
> > > -
> > > -  # Empty body etc results in same hash, we should skip very large numbers..
> > > -  if ($count >= 1000000 || $count_wl >= 10000) {
> > > -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> > > -    return 0;
> > > -  }
> > > -
> > > -  # Whitelisted?
> > > -  if ($wl_limit && $count_wl >= $wl_limit) {
> > > -    dbg("pyzor: message whitelisted");
> > > -    return 0;
> > > -  }
> > > +    # Whitelisted?
> > > +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
> > > +      dbg("pyzor: message whitelisted");
> > > +      return 0;
> > > +    }
> > >  
> > > -  if ($count >= $count_min) {
> > > -    if ($conf->{pyzor_fork}) {
> > > -      # forked needs to run got_hit()
> > > -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
> > > +    if ( $pyzor_count >= $count_min ) {
> > > +      return 1;
> > >      }
> > > -    return 1;
> > > -  }
> > >  
> > > -  return 0;
> > > +    return 0;
> > >  }
> > >  
> > >  sub plugin_report {
> > >    my ($self, $options) = @_;
> > >  
> > > -  return if !$self->{pyzor_available};
> > > -  return if !$self->{main}->{conf}->{use_pyzor};
> > > -  return if $options->{report}->{options}->{dont_report_to_pyzor};
> > > -  return if !$self->is_pyzor_available();
> > > -
> > > -  # use temporary file: open2() is unreliable due to buffering under spamd
> > > -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
> > > -  if ($self->pyzor_report($options, $tmpf)) {
> > > -    $options->{report}->{report_available} = 1;
> > > -    info("reporter: spam reported to Pyzor");
> > > -    $options->{report}->{report_return} = 1;
> > > -  }
> > > -  else {
> > > -    info("reporter: could not report spam to Pyzor");
> > > -  }
> > > -  $options->{report}->delete_fulltext_tmpfile($tmpf);
> > > +  return unless $self->{pyzor_available};
> > > +  return unless $self->{main}->{conf}->{use_pyzor};
> > >  
> > > -  return 1;
> > > +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
> > > +  {
> > > +    if ($self->pyzor_report($options)) {
> > > +      $options->{report}->{report_available} = 1;
> > > +      info("reporter: spam reported to Pyzor");
> > > +      $options->{report}->{report_return} = 1;
> > > +    }
> > > +    else {
> > > +      info("reporter: could not report spam to Pyzor");
> > > +    }
> > > +  }
> > >  }
> > >  
> > >  sub pyzor_report {
> > > -  my ($self, $options, $tmpf) = @_;
> > > -
> > > -  # note: not really tainted, this came from system configuration file
> > > -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
> > > -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
> > > +    my ( $self, $options ) = @_;
> > >  
> > > -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > > +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > >  
> > > -  $options->{report}->enter_helper_run_mode();
> > > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > >  
> > > -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
> > > -  my $err = $timer->run_and_catch(sub {
> > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
> > >  
> > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > -
> > > -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
> > > -
> > > -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > -	$tmpf, 1, $path, split(' ', $opts), "report");
> > > -    $pid or die "$!\n";
> > > -
> > > -    my($inbuf,$nread,$nread_all); $nread_all = 0;
> > > -    # response is ignored, just check its existence
> > > -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
> > > -    defined $nread  or die "error reading from pipe: $!";
> > > -
> > > -    dbg("pyzor: empty response")  if $nread_all < 1;
> > > -
> > > -    my $errno = 0;  close PYZOR or $errno = $!;
> > > -    # closing a pipe also waits for the process executing on the pipe to
> > > -    # complete, no need to explicitly call waitpid
> > > -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
> > > -    if (proc_status_ok($?,$errno, 0)) {
> > > -      dbg("pyzor: [%s] reporter finished successfully", $pid);
> > > -    } else {
> > > -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
> > > +    local $@;
> > > +    my $ref = eval { $client->report($digest); };
> > > +    if ($@) {
> > > +        warn("pyzor: report failed: $@");
> > > +        return 0;
> > >      }
> > > -
> > > -  });
> > > -
> > > -  $options->{report}->leave_helper_run_mode();
> > > -
> > > -  if ($timer->timed_out()) {
> > > -    dbg("reporter: pyzor report timed out after $timeout seconds");
> > > -    return 0;
> > > -  }
> > > -
> > > -  if ($err) {
> > > -    chomp $err;
> > > -    if ($err eq '__brokenpipe__ignore__') {
> > > -      dbg("reporter: pyzor report failed: broken pipe");
> > > -    } else {
> > > -      warn("reporter: pyzor report failed: $err\n");
> > > +    elsif ( $ref->{'Code'} ne 200 ) {
> > > +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > > +        return 0;
> > >      }
> > > -    return 0;
> > > -  }
> > >  
> > > -  return 1;
> > > +    return 1;
> > >  }
> > >  
> > > -# Version features
> > > -sub has_fork { 1 }
> > > -
> > >  1;
> > > -
> > > -=back
> > > -
> > > -=cut
> > > diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
> > > new file mode 100644
> > > index 0000000..8ac27f4
> > > --- /dev/null
> > > +++ b/lib/Mail/SpamAssassin/Pyzor.pm
> > > @@ -0,0 +1,56 @@
> > > +package Mail::SpamAssassin::Pyzor;
> > > +
> > > +# Copyright 2018 cPanel, LLC.
> > > +# All rights reserved.
> > > +# http://cpanel.net
> > > +#
> > > +# <@LICENSE>
> > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > +# contributor license agreements.  See the NOTICE file distributed with
> > > +# this work for additional information regarding copyright ownership.
> > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > +# (the "License"); you may not use this file except in compliance with
> > > +# the License.  You may obtain a copy of the License at:
> > > +#
> > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > +#
> > > +# Unless required by applicable law or agreed to in writing, software
> > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > +# See the License for the specific language governing permissions and
> > > +# limitations under the License.
> > > +# </...@LICENSE>
> > > +#
> > > +
> > > +use strict;
> > > +use warnings;
> > > +
> > > +our $VERSION = '0.06_01';
> > > +
> > > +=encoding utf-8
> > > +
> > > +=head1 NAME
> > > +
> > > +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
> > > +
> > > +=head1 DESCRIPTION
> > > +
> > > +This distribution contains Perl implementations of parts of
> > > +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
> > > +It is intended for use with L<Mail::SpamAssassin> but may be useful
> > > +in other contexts.
> > > +
> > > +See the following modules for information on specific tools that
> > > +the distribution includes:
> > > +
> > > +=over
> > > +
> > > +=item * L<Mail::SpamAssassin::Pyzor::Client>
> > > +
> > > +=item * L<Mail::SpamAssassin::Pyzor::Digest>
> > > +
> > > +=back
> > > +
> > > +=cut
> > > +
> > > +1;
> > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > new file mode 100644
> > > index 0000000..ccff868
> > > --- /dev/null
> > > +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > @@ -0,0 +1,415 @@
> > > +package Mail::SpamAssassin::Pyzor::Client;
> > > +
> > > +# Copyright 2018 cPanel, LLC.
> > > +# All rights reserved.
> > > +# http://cpanel.net
> > > +#
> > > +# <@LICENSE>
> > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > +# contributor license agreements.  See the NOTICE file distributed with
> > > +# this work for additional information regarding copyright ownership.
> > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > +# (the "License"); you may not use this file except in compliance with
> > > +# the License.  You may obtain a copy of the License at:
> > > +#
> > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > +#
> > > +# Unless required by applicable law or agreed to in writing, software
> > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > +# See the License for the specific language governing permissions and
> > > +# limitations under the License.
> > > +# </...@LICENSE>
> > > +#
> > > +
> > > +use strict;
> > > +use warnings;
> > > +
> > > +=encoding utf-8
> > > +
> > > +=head1 NAME
> > > +
> > > +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
> > > +
> > > +=head1 SYNOPSIS
> > > +
> > > +    use Mail::SpamAssassin::Pyzor::Client ();
> > > +    use Mail::SpamAssassin::Pyzor::Digest ();
> > > +
> > > +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
> > > +
> > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
> > > +
> > > +    my $check_ref = $client->check($digest);
> > > +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
> > > +
> > > +    my $report_ref = $client->report($digest);
> > > +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
> > > +
> > > +=head1 DESCRIPTION
> > > +
> > > +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
> > > +implements the functionality needed for L<Mail::SpamAssassin>.
> > > +
> > > +=head1 PROTOCOL DETAILS
> > > +
> > > +The Pyzor protocol is not a published standard, and there appears to be
> > > +no meaningful public documentation. What follows is enough information,
> > > +largely gleaned through forum posts and reverse engineering, to facilitate
> > > +effective use of this module:
> > > +
> > > +Pyzor is an RPC-oriented, message-based protocol. Each message
> > > +is a simple dictionary of 7-bit ASCII keys and values. Server responses
> > > +always include at least the following:
> > > +
> > > +=over
> > > +
> > > +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
> > > +is an error.
> > > +
> > > +=item * C<Diag> - Similar to HTTP status reasons: a text description
> > > +of the status.
> > > +
> > > +=back
> > > +
> > > +(NB: There are additional standard response headers that are useful only for
> > > +the protocol itself and thus are not part of this module???s returns.)
> > > +
> > > +=head2 Reliability
> > > +
> > > +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
> > > +destination. A transmission failure can happen in either the request or
> > > +the response; in either case, a timeout error will result. Such errors
> > > +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
> > > +
> > > +=cut
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +our $VERSION = '0.04';
> > > +
> > > +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
> > > +our $DEFAULT_SERVER_PORT    = 24441;
> > > +our $DEFAULT_USERNAME       = 'anonymous';
> > > +our $DEFAULT_PASSWORD       = '';
> > > +our $DEFAULT_OP_SPEC        = '20,3,60,3';
> > > +our $PYZOR_PROTOCOL_VERSION = 2.1;
> > > +our $DEFAULT_TIMEOUT        = 3.5;
> > > +our $READ_SIZE              = 8192;
> > > +
> > > +use IO::Socket::INET ();
> > > +use Digest::SHA qw(sha1 sha1_hex);
> > > +
> > > +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head1 CONSTRUCTOR
> > > +
> > > +=head2 new(%OPTS)
> > > +
> > > +Create a new pyzor client.
> > > +
> > > +=over 2
> > > +
> > > +=item Input
> > > +
> > > +%OPTS are (all optional):
> > > +
> > > +=over 3
> > > +
> > > +=item * C<server_host> - The pyzor server host to connect to (default is
> > > +C<public.pyzor.org>)
> > > +
> > > +=item * C<server_port> - The pyzor server port to connect to (default is
> > > +24441)
> > > +
> > > +=item * C<username> - The username to present to the pyzor server (default
> > > +is C<anonymous>)
> > > +
> > > +=item * C<password> - The password to present to the pyzor server (default
> > > +is empty)
> > > +
> > > +=item * C<timeout> - The maximum time, in seconds, to wait for a response
> > > +from the pyzor server (defeault is 3.5)
> > > +
> > > +=back
> > > +
> > > +=item Output
> > > +
> > > +=over 3
> > > +
> > > +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
> > > +
> > > +=back
> > > +
> > > +=back
> > > +
> > > +=cut
> > > +
> > > +sub new {
> > > +    my ( $class, %OPTS ) = @_;
> > > +
> > > +    return bless {
> > > +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
> > > +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
> > > +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
> > > +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
> > > +        '_op_spec'     => $DEFAULT_OP_SPEC,
> > > +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
> > > +    }, $class;
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head1 REQUEST METHODS
> > > +
> > > +=head2 report($digest)
> > > +
> > > +Report the digest of a spam message to the pyzor server. This function
> > > +will throw if a messaging failure or timeout happens.
> > > +
> > > +=over 2
> > > +
> > > +=item Input
> > > +
> > > +=over 3
> > > +
> > > +=item $digest C<SCALAR>
> > > +
> > > +The message digest to report, as given by
> > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > +
> > > +=back
> > > +
> > > +=item Output
> > > +
> > > +=over 3
> > > +
> > > +=item C<HASHREF>
> > > +
> > > +Returns a hashref of the standard attributes noted above.
> > > +
> > > +=back
> > > +
> > > +=back
> > > +
> > > +=cut
> > > +
> > > +sub report {
> > > +    my ( $self, $digest ) = @_;
> > > +
> > > +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
> > > +
> > > +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
> > > +
> > > +    return $self->_send_receive_msg($msg_ref);
> > > +}
> > > +
> > > +=head2 check($digest)
> > > +
> > > +Check the digest of a message to see if
> > > +the pyzor server has a report for it. This function
> > > +will throw if a messaging failure or timeout happens.
> > > +
> > > +=over 2
> > > +
> > > +=item Input
> > > +
> > > +=over 3
> > > +
> > > +=item $digest C<SCALAR>
> > > +
> > > +The message digest to check, as given by
> > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > +
> > > +=back
> > > +
> > > +=item Output
> > > +
> > > +=over 3
> > > +
> > > +=item C<HASHREF>
> > > +
> > > +Returns a hashref of the standard attributes noted above
> > > +as well as the following:
> > > +
> > > +=over
> > > +
> > > +=item * C<Count> - The number of reports the server has received
> > > +for the given digest.
> > > +
> > > +=item * C<WL-Count> - The number of whitelist requests the server has received
> > > +for the given digest.
> > > +
> > > +=back
> > > +
> > > +=back
> > > +
> > > +=back
> > > +
> > > +=cut
> > > +
> > > +sub check {
> > > +    my ( $self, $digest ) = @_;
> > > +
> > > +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
> > > +}
> > > +
> > > +# ----------------------------------------
> > > +
> > > +sub _send_receive_msg {
> > > +    my ( $self, $msg_ref ) = @_;
> > > +
> > > +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
> > > +
> > > +    $self->_sign_msg($msg_ref);
> > > +
> > > +    return $self->_do_send_receive(
> > > +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
> > > +        $thread_id,
> > > +    );
> > > +}
> > > +
> > > +sub _get_base_msg {
> > > +    my ( $self, $op, $digest ) = @_;
> > > +
> > > +    die "Implementor error: op is required" if !$op;
> > > +    die "error: digest is required"         if !$digest;
> > > +
> > > +    return {
> > > +        'User'      => $self->{'_username'},
> > > +        'PV'        => $PYZOR_PROTOCOL_VERSION,
> > > +        'Time'      => time(),
> > > +        'Op'        => $op,
> > > +        'Op-Digest' => $digest,
> > > +        'Thread'    => $self->_generate_thread_id()
> > > +    };
> > > +}
> > > +
> > > +sub _do_send_receive {
> > > +    my ( $self, $packet, $thread_id ) = @_;
> > > +
> > > +    my $sock = $self->_get_connection_or_die();
> > > +
> > > +    $self->_send_packet( $sock, $packet );
> > > +    my $response = $self->_receive_packet( $sock, $thread_id );
> > > +
> > > +    return 0 if not defined $response;
> > > +
> > > +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
> > > +
> > > +    delete $resp_hr->{'Thread'};
> > > +
> > > +    my $response_pv = delete $resp_hr->{'PV'};
> > > +
> > > +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
> > > +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
> > > +    }
> > > +
> > > +    return $resp_hr;
> > > +}
> > > +
> > > +sub _receive_packet {
> > > +    my ( $self, $sock, $thread_id ) = @_;
> > > +
> > > +    my $timeout = $self->{'_timeout'} * 1000;
> > > +
> > > +    my $end_time = time + $self->{'_timeout'};
> > > +
> > > +    $sock->blocking(0);
> > > +    my $response = '';
> > > +    my $rout     = '';
> > > +    my $rin      = '';
> > > +    vec( $rin, fileno($sock), 1 ) = 1;
> > > +
> > > +    while (1) {
> > > +        my $time_left = $end_time - time;
> > > +
> > > +        if ( $time_left <= 0 ) {
> > > +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
> > > +          return;
> > > +        }
> > > +
> > > +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
> > > +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
> > > +            warn "read from socket: $!";
> > > +        }
> > > +
> > > +        if ( index( $response, "\n\n" ) > -1 ) {
> > > +
> > > +            # Reject the response unless its thread ID matches what we sent.
> > > +            # This prevents confusion among concurrent Pyzor reqeusts.
> > > +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
> > > +                last;
> > > +            }
> > > +            else {
> > > +                $response = '';
> > > +            }
> > > +        }
> > > +
> > > +        my $found = select( $rout = $rin, undef, undef, $time_left );
> > > +        warn "select(): $!" if $found == -1;
> > > +    }
> > > +
> > > +    return $response;
> > > +}
> > > +
> > > +sub _send_packet {
> > > +    my ( $self, $sock, $packet ) = @_;
> > > +
> > > +    $sock->blocking(1);
> > > +    syswrite( $sock, $packet ) or warn "write to socket: $!";
> > > +
> > > +    return;
> > > +}
> > > +
> > > +sub _get_connection_or_die {
> > > +    my ($self) = @_;
> > > +
> > > +    # clear the socket if the PID changes
> > > +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
> > > +        undef $self->{'_sock_pid'};
> > > +        undef $self->{'_sock'};
> > > +    }
> > > +
> > > +    $self->{'_sock_pid'} ||= $$;
> > > +    $self->{'_sock'}     ||= IO::Socket::INET->new(
> > > +        'PeerHost' => $self->{'_server_host'},
> > > +        'PeerPort' => $self->{'_server_port'},
> > > +        'Proto'    => 'udp'
> > > +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
> > > +
> > > +    return $self->{'_sock'};
> > > +}
> > > +
> > > +sub _sign_msg {
> > > +    my ( $self, $msg_ref ) = @_;
> > > +
> > > +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
> > > +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
> > > +    );
> > > +
> > > +    return 1;
> > > +}
> > > +
> > > +sub _generate_packet_from_message {
> > > +    my ( $self, $msg_ref ) = @_;
> > > +
> > > +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
> > > +}
> > > +
> > > +sub _generate_thread_id {
> > > +    my $RAND_MAX = 2**16;
> > > +    my $val      = 0;
> > > +    $val = int rand($RAND_MAX) while $val < 1024;
> > > +    return $val;
> > > +}
> > > +
> > > +sub _get_user_pass_hash_key {
> > > +    my ($self) = @_;
> > > +
> > > +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
> > > +}
> > > +
> > > +1;
> > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > new file mode 100644
> > > index 0000000..0e8a5ae
> > > --- /dev/null
> > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > @@ -0,0 +1,103 @@
> > > +package Mail::SpamAssassin::Pyzor::Digest;
> > > +
> > > +# Copyright 2018 cPanel, LLC.
> > > +# All rights reserved.
> > > +# http://cpanel.net
> > > +#
> > > +# <@LICENSE>
> > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > +# contributor license agreements.  See the NOTICE file distributed with
> > > +# this work for additional information regarding copyright ownership.
> > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > +# (the "License"); you may not use this file except in compliance with
> > > +# the License.  You may obtain a copy of the License at:
> > > +#
> > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > +#
> > > +# Unless required by applicable law or agreed to in writing, software
> > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > +# See the License for the specific language governing permissions and
> > > +# limitations under the License.
> > > +# </...@LICENSE>
> > > +#
> > > +
> > > +use strict;
> > > +use warnings;
> > > +
> > > +=encoding utf-8
> > > +
> > > +=head1 NAME
> > > +
> > > +Mail::SpamAssassin::Pyzor::Digest
> > > +
> > > +=head1 SYNOPSIS
> > > +
> > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
> > > +
> > > +=head1 DESCRIPTION
> > > +
> > > +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
> > > +
> > > +=cut
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +use Email::MIME ();
> > > +
> > > +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
> > > +use Digest::SHA qw(sha1_hex);
> > > +
> > > +our $VERSION = '0.03';
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head1 FUNCTIONS
> > > +
> > > +=head2 $hex = get( $MSG )
> > > +
> > > +This takes an email message in raw MIME text format (i.e., as saved in the
> > > +standard mbox format) and returns the message???s Pyzor digest in lower-case
> > > +hexadecimal.
> > > +
> > > +The output from this function should normally be identical to that of
> > > +the C<pyzor> script???s C<digest> command. It is suitable for use in
> > > +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
> > > +
> > > +=cut
> > > +
> > > +sub get {
> > > +    my ($text) = @_;
> > > +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
> > > +}
> > > +
> > > +# NB: This is called from the test.
> > > +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
> > > +    my ($msg_text_sr) = @_;
> > > +
> > > +    my $parsed = Email::MIME->new($$msg_text_sr);
> > > +
> > > +    my @lines;
> > > +
> > > +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
> > > +
> > > +    for my $payload (@$payloads_ar) {
> > > +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
> > > +        for my $line (@p_lines) {
> > > +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
> > > +
> > > +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
> > > +
> > > +            # Make sure we have an octet string.
> > > +            utf8::encode($line) if utf8::is_utf8($line);
> > > +
> > > +            push @lines, $line;
> > > +        }
> > > +    }
> > > +
> > > +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
> > > +
> > > +    return $digest_sr;
> > > +}
> > > +
> > > +1;
> > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > new file mode 100644
> > > index 0000000..522accd
> > > --- /dev/null
> > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > @@ -0,0 +1,301 @@
> > > +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
> > > +
> > > +# Copyright 2018 cPanel, LLC.
> > > +# All rights reserved.
> > > +# http://cpanel.net
> > > +#
> > > +# <@LICENSE>
> > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > +# contributor license agreements.  See the NOTICE file distributed with
> > > +# this work for additional information regarding copyright ownership.
> > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > +# (the "License"); you may not use this file except in compliance with
> > > +# the License.  You may obtain a copy of the License at:
> > > +#
> > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > +#
> > > +# Unless required by applicable law or agreed to in writing, software
> > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > +# See the License for the specific language governing permissions and
> > > +# limitations under the License.
> > > +# </...@LICENSE>
> > > +#
> > > +
> > > +use strict;
> > > +use warnings;
> > > +
> > > +=encoding utf-8
> > > +
> > > +=head1 NAME
> > > +
> > > +Mail::SpamAssassin::Pyzor::Digest::Pieces
> > > +
> > > +=head1 DESCRIPTION
> > > +
> > > +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
> > > +
> > > +It reimplements logic found in pyzor???s F<digest.py> module
> > > +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
> > > +
> > > +=cut
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +use Email::MIME::ContentType ();
> > > +use Encode                   ();
> > > +
> > > +our $VERSION = '0.03';
> > > +
> > > +# each tuple is [ offset, length ]
> > > +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
> > > +
> > > +use constant {
> > > +    _MIN_LINE_LENGTH => 8,
> > > +
> > > +    _ATOMIC_NUM_LINES => 4,
> > > +};
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head1 FUNCTIONS
> > > +
> > > +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
> > > +
> > > +This imitates the corresponding object method in F<digest.py>.
> > > +It returns a reference to an array of strings. Each string can be either
> > > +a byte string or a character string (e.g., UTF-8 decoded).
> > > +
> > > +NB: RFC 2822 stipulates that message bodies should use CRLF
> > > +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
> > > +will thus convert any plain CRs in a quoted-printable message
> > > +body into CRLF. Python, though, doesn???t do this, so the output of
> > > +our implementation of C<digest_payloads()> diverges from that of the Python
> > > +original. It doesn???t ultimately make a difference since the line-ending
> > > +whitespace gets trimmed regardless, but it???s necessary to factor in when
> > > +comparing the output of our implementation with the Python output.
> > > +
> > > +=cut
> > > +
> > > +sub digest_payloads {
> > > +    my ($parsed) = @_;
> > > +
> > > +    my @subparts = $parsed->subparts();
> > > +
> > > +    my @payloads;
> > > +
> > > +    if (@subparts) {
> > > +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
> > > +    }
> > > +    else {
> > > +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
> > > +
> > > +        my $payload;
> > > +
> > > +        if ( $main_type eq 'text' ) {
> > > +
> > > +            # Decode transfer encoding, but leave us as a byte string.
> > > +            # Note that this is where Email::MIME converts plain LF to CRLF.
> > > +            $payload = $parsed->body();
> > > +
> > > +            # This does the actual character decoding (i.e., ???charset???).
> > > +            $payload = Encode::decode( $encoding, $payload, $encode_check );
> > > +
> > > +            if ( $subtype eq 'html' ) {
> > > +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
> > > +            }
> > > +        }
> > > +        else {
> > > +
> > > +            # This does no decoding, even of, e.g., quoted-printable or base64.
> > > +            $payload = $parsed->body_raw();
> > > +        }
> > > +
> > > +        push @payloads, $payload;
> > > +    }
> > > +
> > > +    return \@payloads;
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head2 normalize( $STRING )
> > > +
> > > +This imitates the corresponding object method in F<digest.py>.
> > > +It modifies C<$STRING> in-place.
> > > +
> > > +As with the original implementation, if C<$STRING> contains (decoded)
> > > +Unicode characters, those characters will be parsed accordingly. So:
> > > +
> > > +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
> > > +
> > > +    normalize($str);
> > > +
> > > +The above will leave C<$str> alone, but this:
> > > +
> > > +    utf8::decode($str);
> > > +
> > > +    normalize($str);
> > > +
> > > +??? will trim off the last two bytes from C<$str>.
> > > +
> > > +=cut
> > > +
> > > +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
> > > +
> > > +    # NULs are bad, mm-kay?
> > > +    $_[0] =~ tr<\0><>d;
> > > +
> > > +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
> > > +    # with the /a modifier.
> > > +    #
> > > +    # https://docs.python.org/2/library/re.html
> > > +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
> > > +
> > > +    # Python: re.compile(r'\S{10,}')
> > > +    $_[0] =~ s<\S{10,}><>ag;
> > > +
> > > +    # Python: re.compile(r'\S+@\S+')
> > > +    $_[0] =~ s<\S+ @ \S+><>agx;
> > > +
> > > +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
> > > +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
> > > +
> > > +    # (from digest.py ???)
> > > +    # Make sure we do the whitespace last because some of the previous
> > > +    # patterns rely on whitespace.
> > > +    $_[0] =~ tr< \x09-\x0d><>d;
> > > +
> > > +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
> > > +    # strip, then calls strip() on the string, which *will* strip Unicode
> > > +    # whitespace from the ends.
> > > +    $_[0] =~ s<\A\s+><>;
> > > +    $_[0] =~ s<\s+\z><>;
> > > +
> > > +    return;
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head2 $yn = should_handle_line( $STRING )
> > > +
> > > +This imitates the corresponding object method in F<digest.py>.
> > > +It returns a boolean.
> > > +
> > > +=cut
> > > +
> > > +sub should_handle_line {
> > > +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head2 $sr = assemble_lines( \@LINES )
> > > +
> > > +This assembles a string buffer out of @LINES. The string is the buffer
> > > +of octets that will be hashed to produce the message digest.
> > > +
> > > +Each member of @LINES is expected to be an B<octet string>, not a
> > > +character string.
> > > +
> > > +=cut
> > > +
> > > +sub assemble_lines {
> > > +    my ($lines_ar) = @_;
> > > +
> > > +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
> > > +
> > > +        # cf. handle_atomic() in digest.py
> > > +        return \join( q<>, @$lines_ar );
> > > +    }
> > > +
> > > +    #----------------------------------------------------------------------
> > > +    # cf. handle_atomic() in digest.py
> > > +
> > > +    my $str = q<>;
> > > +
> > > +    for my $ofs_len ( _HASH_SPEC() ) {
> > > +        my ( $offset, $length ) = @$ofs_len;
> > > +
> > > +        for my $i ( 0 .. ( $length - 1 ) ) {
> > > +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
> > > +
> > > +            next if !defined $lines_ar->[$idx];
> > > +
> > > +            $str .= $lines_ar->[$idx];
> > > +        }
> > > +    }
> > > +
> > > +    return \$str;
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
> > > +
> > > +=cut
> > > +
> > > +use constant _QUOTED_PRINTABLE_NAMES => (
> > > +    "quopri-codec",
> > > +    "quopri",
> > > +    "quoted-printable",
> > > +    "quotedprintable",
> > > +);
> > > +
> > > +# Make Encode::decode() ignore anything that doesn???t fit the
> > > +# given encoding.
> > > +use constant _encode_check_ignore => q<>;
> > > +
> > > +sub parse_content_type {
> > > +    my ($content_type) = @_;
> > > +
> > > +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
> > > +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
> > > +        $content_type,
> > > +    );
> > > +
> > > +    my $main = $ct_parse->{'type'}    || q<>;
> > > +    my $sub  = $ct_parse->{'subtype'} || q<>;
> > > +
> > > +    my $encoding = $ct_parse->{'attributes'}{'charset'};
> > > +
> > > +    my $checkval;
> > > +
> > > +    if ($encoding) {
> > > +
> > > +        # Lower-case everything, convert underscore to dash, and remove NUL.
> > > +        $encoding =~ tr<A-Z_\0><a-z->d;
> > > +
> > > +        # Apparently pyzor accommodates messages that put the transfer
> > > +        # encoding in the Content-Type.
> > > +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
> > > +            $checkval = Encode::FB_CROAK();
> > > +        }
> > > +    }
> > > +    else {
> > > +        $encoding = 'ascii';
> > > +    }
> > > +
> > > +    # Match Python .decode()???s 'ignore' behavior
> > > +    $checkval ||= \&_encode_check_ignore;
> > > +
> > > +    return ( $main, $sub, $encoding, $checkval );
> > > +}
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head2 @lines = splitlines( $TEXT )
> > > +
> > > +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
> > > +
> > > +Returns a plain list in list context. Returns the number of
> > > +items to be returned in scalar context.
> > > +
> > > +=cut
> > > +
> > > +sub splitlines {
> > > +    return split m<\r\n?|\n>, $_[0];
> > > +}
> > > +
> > > +1;
> > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > new file mode 100644
> > > index 0000000..2617b4a
> > > --- /dev/null
> > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > @@ -0,0 +1,177 @@
> > > +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > +
> > > +# Copyright 2018 cPanel, LLC.
> > > +# All rights reserved.
> > > +# http://cpanel.net
> > > +#
> > > +# <@LICENSE>
> > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > +# contributor license agreements.  See the NOTICE file distributed with
> > > +# this work for additional information regarding copyright ownership.
> > > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > > +# (the "License"); you may not use this file except in compliance with
> > > +# the License.  You may obtain a copy of the License at:
> > > +#
> > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > +#
> > > +# Unless required by applicable law or agreed to in writing, software
> > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > +# See the License for the specific language governing permissions and
> > > +# limitations under the License.
> > > +# </...@LICENSE>
> > > +#
> > > +
> > > +use strict;
> > > +use warnings;
> > > +
> > > +=encoding utf-8
> > > +
> > > +=head1 NAME
> > > +
> > > +Mail::SpamAssassin::Pyzor::Digest::StripHtml
> > > +
> > > +=head1 SYNOPSIS
> > > +
> > > +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
> > > +
> > > +=head1 DESCRIPTION
> > > +
> > > +This module attempts to duplicate pyzor???s HTML-stripping logic.
> > > +
> > > +=head1 ACCURACY
> > > +
> > > +This library cannot achieve 100%, bug-for-bug parity with pyzor
> > > +because to do so would require duplicating Python???s own HTML parsing
> > > +library. Since that library???s output has changed over time, and those
> > > +changes in turn affect pyzor, it???s literally impossible to arrive at
> > > +a single, fully-compatible reimplementation.
> > > +
> > > +That said, all known divergences between pyzor and this library involve
> > > +invalid HTML as input.
> > > +
> > > +Please open bug reports for any divergences you identify, particularly
> > > +if the input is valid HTML.
> > > +
> > > +=cut
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +use HTML::Parser ();
> > > +
> > > +our $VERSION = '0.03';
> > > +
> > > +#----------------------------------------------------------------------
> > > +
> > > +=head1 FUNCTIONS
> > > +
> > > +=head2 $stripped = strip( $HTML )
> > > +
> > > +Give it some HTML, and it???ll give back the stripped text.
> > > +
> > > +In B<general>, the stripping consists of removing tags as well as
> > > +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
> > > +removes HTML entities.
> > > +
> > > +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
> > > +
> > > +=cut
> > > +
> > > +sub strip {
> > > +    my ($html) = @_;
> > > +
> > > +    $html =~ s<\A\s+><>;
> > > +    $html =~ s<\s+\z><>;
> > > +
> > > +    my $p = HTML::Parser->new( api_version => 3 );
> > > +
> > > +    my @pieces;
> > > +
> > > +    my $accumulate = 1;
> > > +
> > > +    $p->handler(
> > > +        start => sub {
> > > +            my ($tagname) = @_;
> > > +
> > > +            $accumulate = 0 if $tagname eq 'script';
> > > +            $accumulate = 0 if $tagname eq 'style';
> > > +
> > > +            return;
> > > +        },
> > > +        'tagname',
> > > +    );
> > > +
> > > +    $p->handler(
> > > +        end => sub {
> > > +            $accumulate = 1;
> > > +            return;
> > > +        }
> > > +    );
> > > +
> > > +    $p->handler(
> > > +        text => sub {
> > > +            my ($copy) = @_;
> > > +
> > > +            return if !$accumulate;
> > > +
> > > +            # pyzor???s HTML parser discards HTML entities. On top of that,
> > > +            # we need to match, as closely as possible, pyzor???s handling of
> > > +            # invalid HTML entities ??? which is a function of Python???s
> > > +            # standard HTML parsing library. This will probably never be
> > > +            # fully compatible with the pyzor, but we can get it close.
> > > +
> > > +            # The original is:
> > > +            #
> > > +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
> > > +            #
> > > +            # The parsing loop then ???backs up??? one byte if the last
> > > +            # character isn???t a ???;???. We use a look-ahead assertion to
> > > +            # mimic that behavior.
> > > +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
> > > +
> > > +            # The original is:
> > > +            #
> > > +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
> > > +            #
> > > +            # We again use a look-ahead assertion to mimic Python.
> > > +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
> > > +
> > > +            # Python???s HTMLParser aborts its parsing loop when it encounters
> > > +            # an invalid numeric reference.
> > > +            $copy =~ s<\&\#
> > > +                (?:
> > > +                    [^0-9xX]        # anything but the expected first char
> > > +                    |
> > > +                    [0-9]+[a-fA-F]  # hex within decimal
> > > +                    |
> > > +                    [xX][^0-9a-fA-F]
> > > +                )
> > > +                (.*)
> > > +            ><
> > > +                ( -1 == index($1, ';') ) ? q<> : '&#'
> > > +            >exs;
> > > +
> > > +            # Python???s HTMLParser treats invalid entities as incomplete
> > > +            $copy =~ s<(\&\#?)><$1 >gx;
> > > +
> > > +            $copy =~ s<\A\s+><>;
> > > +            $copy =~ s<\s+\z><>;
> > > +
> > > +            push @pieces, \$copy if length $copy;
> > > +        },
> > > +        'text,tagname',
> > > +    );
> > > +
> > > +    $p->parse($html);
> > > +    $p->eof();
> > > +
> > > +    my $payload = join( q< >, map { $$_ } @pieces );
> > > +
> > > +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
> > > +    # plain spaces.
> > > +    $payload =~ s<[^\S\x{a0}]+>< >g;
> > > +
> > > +    return $payload;
> > > +}
> > > +
> > > +1;
> > > diff --git a/t/pyzor.t b/t/pyzor.t
> > > index 891f38d..e4ef83f 100755
> > > --- a/t/pyzor.t
> > > +++ b/t/pyzor.t
> > > @@ -3,12 +3,9 @@
> > >  use lib '.'; use lib 't';
> > >  use SATest; sa_t_init("pyzor");
> > >  
> > > -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
> > > -
> > >  use Test::More;
> > >  plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
> > > -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
> > > -plan tests => 8;
> > > +plan tests => 5;
> > >  
> > >  diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
> > >  
> > > @@ -30,7 +27,7 @@ tstprefs ("
> > >  sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > >  ok_all_patterns();
> > >  # Same with fork
> > > -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
> > > +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > >  ok_all_patterns();
> > >  
> > >  #TESTING FOR HAM
> > > @@ -44,7 +41,3 @@ ok_all_patterns();
> > >  
> > >  sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
> > >  ok_all_patterns();
> > > -# same with fork
> > > -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
> > > -ok_all_patterns();
> > > -
> > 
> > 



Re: new Pyzor implementation

Posted by Giovanni Bechis <gi...@paclan.it>.
Once committed, code will be no more developed by cPanel on CPAN
and original code will be removed.

I can work to integrate old and new Pyzor versions.

 Giovanni

On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
> 
> If it's developed by cPanel in CPAN, then it should not be committed to SA,
> unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
> we have developer resources and will to take it aboard.
> 
> As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
> as it makes no sense to ditch support for the widely installed original
> Pyzor.
> 
> 
> On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
> > Hi,
> > cPanel has developed a native Perl Pyzor implementation for SpamAssassin
> > and a diff against SpamAssassin 4.0 follows.
> > Atm I am using it in production on a small server, more tests and
> > opinions are welcome.
> > 
> > Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
> > 
> >  Cheers
> >   Giovanni
> > 
> > diff --git a/MANIFEST b/MANIFEST
> > index 25d0192..2d9588c 100644
> > --- a/MANIFEST
> > +++ b/MANIFEST
> > @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
> >  lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
> >  lib/Mail/SpamAssassin/PluginHandler.pm
> >  lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
> > +lib/Mail/SpamAssassin/Pyzor/Client.pm
> > +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > +lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > +lib/Mail/SpamAssassin/Pyzor.pm
> >  lib/Mail/SpamAssassin/RegistryBoundaries.pm
> >  lib/Mail/SpamAssassin/Reporter.pm
> >  lib/Mail/SpamAssassin/SQLBasedAddrList.pm
> > diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > index 3efd4b4..e4c9c05 100644
> > --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
> >  
> >  use Mail::SpamAssassin::Plugin;
> >  use Mail::SpamAssassin::Logger;
> > -use Mail::SpamAssassin::Timeout;
> > -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
> > -                                proc_status_ok exit_status_str);
> > +use Mail::SpamAssassin::Util qw(untaint_var);
> > +
> >  use strict;
> >  use warnings;
> >  # use bytes;
> >  use re 'taint';
> >  
> > -use Storable;
> > -use POSIX qw(PIPE_BUF WNOHANG _exit);
> > -
> >  our @ISA = qw(Mail::SpamAssassin::Plugin);
> >  
> >  sub new {
> > @@ -78,7 +74,7 @@ sub set_config {
> >    my ($self, $conf) = @_;
> >    my @cmds;
> >  
> > -=head1 USER OPTIONS
> > +=head1 ADMINISTRATOR OPTIONS
> >  
> >  =over 4
> >  
> > @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
> >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
> >    });
> >  
> > -=item pyzor_fork (0|1)		(default: 0)
> > -
> > -Instead of running Pyzor synchronously, fork separate process for it and
> > -read the results in later (similar to async DNS lookups).  Increases
> > -throughput.  Experimental.
> > -
> > -=cut
> > -
> > -  push(@cmds, {
> > -    setting => 'pyzor_fork',
> > -    is_admin => 1,
> > -    default => 0,
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > -  });
> > -
> > -=item pyzor_count_min NUMBER	(default: 5)
> > +=item pyzor_count_min NUMBER		(default: 5)
> >  
> >  This option sets how often a message's body checksum must have been
> >  reported to the Pyzor server before SpamAssassin will consider the Pyzor
> > @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
> >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> >    });
> >  
> > -  # Deprecated setting, the name makes no sense!
> > -  push (@cmds, {
> > -    setting => 'pyzor_max',
> > -    is_admin => 1,
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > -    code => sub {
> > -      my ($self, $key, $value, $line) = @_;
> > -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
> > -      if ($value !~ /^\d+$/) {
> > -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > -      }
> > -      $self->{pyzor_count_min} = $value;
> > -    }
> > -  });
> > -
> > -=item pyzor_whitelist_min NUMBER	(default: 10)
> > -
> > -This option sets how often a message's body checksum must have been
> > -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > -result.  Final decision is made by pyzor_whitelist_factor.
> > -
> > -=cut
> > -
> > -  push (@cmds, {
> > -    setting => 'pyzor_whitelist_min',
> > -    is_admin => 1,
> > -    default => 10,
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > -  });
> > -
> > -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
> > -
> > -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > -For default setting this means: 50 reports requires 10 whitelistings.
> > -
> > -=cut
> > -
> > -  push (@cmds, {
> > -    setting => 'pyzor_whitelist_factor',
> > -    is_admin => 1,
> > -    default => 0.2,
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > -  });
> > -
> >  =back
> >  
> > -=head1 ADMINISTRATOR OPTIONS
> > -
> >  =over 4
> >  
> >  =item pyzor_timeout n		(default: 5)
> > @@ -210,478 +145,182 @@ removing one of them.
> >      type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
> >    });
> >  
> > -=item pyzor_options options
> > +=item pyzor_whitelist_min NUMBER        (default: 10)
> >  
> > -Specify additional options to the pyzor(1) command. Please note that only
> > -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
> > +This option sets how often a message's body checksum must have been
> > +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> > +result.  Final decision is made by pyzor_whitelist_factor.
> >  
> >  =cut
> >  
> >    push (@cmds, {
> > -    setting => 'pyzor_options',
> > +    setting => 'pyzor_whitelist_min',
> >      is_admin => 1,
> > -    default => '',
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > -    code => sub {
> > -      my ($self, $key, $value, $line) = @_;
> > -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
> > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > -      }
> > -      $self->{pyzor_options} = $1;
> > -    }
> > +    default => 10,
> > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> >    });
> >  
> > -=item pyzor_path STRING
> > +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
> >  
> > -This option tells SpamAssassin specifically where to find the C<pyzor>
> > -client instead of relying on SpamAssassin to find it in the current
> > -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
> > -you should use this, as the current PATH will have been cleared.
> > +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > +For default setting this means: 50 reports requires 10 whitelistings.
> >  
> >  =cut
> >  
> >    push (@cmds, {
> > -    setting => 'pyzor_path',
> > +    setting => 'pyzor_whitelist_factor',
> >      is_admin => 1,
> > -    default => undef,
> > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > -    code => sub {
> > -      my ($self, $key, $value, $line) = @_;
> > -      if (!defined $value || !length $value) {
> > -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> > -      }
> > -      $value = untaint_file_path($value);
> > -      if (!-x $value) {
> > -	info("config: pyzor_path \"$value\" isn't an executable");
> > -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > -      }
> > -
> > -      $self->{pyzor_path} = $value;
> > -    }
> > +    default => 0.2,
> > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> >    });
> >  
> >    $conf->{parser}->register_commands(\@cmds);
> >  }
> >  
> >  sub is_pyzor_available {
> > -  my ($self) = @_;
> > +    my ($self) = @_;
> >  
> > -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
> > -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
> > -
> > -  unless ($pyzor && -x $pyzor) {
> > -    dbg("pyzor: no pyzor executable found");
> > -    $self->{pyzor_available} = 0;
> > -    return 0;
> > -  }
> > -
> > -  # remember any found pyzor
> > -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
> > -
> > -  dbg("pyzor: pyzor is available: $pyzor");
> > -  return 1;
> > +    local $@;
> > +    eval {
> > +        require Mail::SpamAssassin::Pyzor::Digest;
> > +        require Mail::SpamAssassin::Pyzor::Client;
> > +    };
> > +    return $@ ? 0 : 1;
> >  }
> >  
> > -sub finish_parsing_start {
> > -  my ($self, $opts) = @_;
> > +sub get_pyzor_interface {
> > +  my ($self) = @_;
> >  
> > -  # If forking, hard adjust priority -100 to launch early
> > -  # Find rulenames from eval_to_rule mappings
> > -  if ($opts->{conf}->{pyzor_fork}) {
> > -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
> > -      dbg("pyzor: adjusting rule $_ priority to -100");
> > -      $opts->{conf}->{priority}->{$_} = -100;
> > -    }
> > +  if (!$self->{main}->{conf}->{use_pyzor}) {
> > +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
> > +    $self->{pyzor_interface} = "disabled";
> > +    $self->{pyzor_available} = 0;
> > +  }
> > +  elsif ($self->is_pyzor_available()) {
> > +    $self->{pyzor_interface} = "pyzor";
> > +    $self->{pyzor_available} = 1;
> > +  }
> > +  else {
> > +    dbg("pyzor: no pyzor found, disabling Pyzor");
> > +    $self->{pyzor_available} = 0;
> >    }
> >  }
> >  
> >  sub check_pyzor {
> > -  my ($self, $pms, $full) = @_;
> > -
> > -  return 0 if !$self->{pyzor_available};
> > -  return 0 if !$self->{main}->{conf}->{use_pyzor};
> > -
> > -  return 0 if $pms->{pyzor_running};
> > -  $pms->{pyzor_running} = 1;
> > -
> > -  return 0 if !$self->is_pyzor_available();
> > -
> > -  my $timer = $self->{main}->time_method("check_pyzor");
> > +  my ($self, $permsgstatus, $full) = @_;
> >  
> >    # initialize valid tags
> > -  $pms->{tag_data}->{PYZOR} = '';
> > -
> > -  # create fulltext tmpfile now (before possible forking)
> > -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
> > -
> > -  ## non-forking method
> > -
> > -  if (!$self->{main}->{conf}->{pyzor_fork}) {
> > -    my @results = $self->pyzor_lookup($pms);
> > -    return $self->_check_result($pms, \@results);
> > -  }
> > -
> > -  ## forking method
> > -
> > -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
> > -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
> > -
> > -  # create socketpair for communication
> > -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
> > -  my $back_selector = '';
> > -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
> > -  eval {
> > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
> > -  } or do {
> > -    dbg("pyzor: backchannel pre-setup failed: $@");
> > -    delete $pms->{pyzor_backchannel};
> > -    return 0;
> > -  };
> > +  $permsgstatus->{tag_data}->{PYZOR} = "";
> >  
> > -  my $pid = fork();
> > -  if (!defined $pid) {
> > -    info("pyzor: child fork failed: $!");
> > -    delete $pms->{pyzor_backchannel};
> > -    return 0;
> > -  }
> > -  if (!$pid) {
> > -    $0 = "$0 (pyzor)";
> > -    $SIG{CHLD} = 'DEFAULT';
> > -    $SIG{PIPE} = 'IGNORE';
> > -    $SIG{$_} = sub {
> > -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
> > -      _exit(6);  # avoid END and destructor processing
> > -      kill('KILL',$$);  # still kicking? die!
> > -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
> > -    dbg("pyzor: child process $$ forked");
> > -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
> > -    my @results = $self->pyzor_lookup($pms);
> > -    my $backmsg;
> > -    eval {
> > -      $backmsg = Storable::freeze(\@results);
> > -    };
> > -    if ($@) {
> > -      dbg("pyzor: child return value freeze failed: $@");
> > -      _exit(0); # avoid END and destructor processing
> > -    }
> > -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
> > -      dbg("pyzor: child backchannel write failed: $!");
> > -    }
> > -    _exit(0); # avoid END and destructor processing
> > -  }
> > -
> > -  $pms->{pyzor_pid} = $pid;
> > +  my $timer = $self->{main}->time_method("check_pyzor");
> >  
> > -  eval {
> > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
> > -  } or do {
> > -    dbg("pyzor: backchannel post-setup failed: $@");
> > -    delete $pms->{pyzor_backchannel};
> > -    return 0;
> > -  };
> > +  $self->get_pyzor_interface();
> > +  return 0 unless $self->{pyzor_available};
> >  
> > -  return 0;
> > +  return $self->pyzor_lookup($permsgstatus, $full);
> >  }
> >  
> >  sub pyzor_lookup {
> > -  my ($self, $pms) = @_;
> > -
> > -  my $conf = $self->{main}->{conf};
> > -  my $timeout = $conf->{pyzor_timeout};
> > -
> > -  # note: not really tainted, this came from system configuration file
> > -  my $path = untaint_file_path($conf->{pyzor_path});
> > -  my $opts = untaint_var($conf->{pyzor_options}) || '';
> > -
> > -  $pms->enter_helper_run_mode();
> > -
> > -  my $pid;
> > -  my @resp;
> > -  my $timer = Mail::SpamAssassin::Timeout->new(
> > -           { secs => $timeout, deadline => $pms->{master_deadline} });
> > -  my $err = $timer->run_and_catch(sub {
> > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > -
> > -    dbg("pyzor: opening pipe: ".
> > -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
> > -
> > -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
> > -    $pid or die "$!\n";
> > -
> > -    # read+split avoids a Perl I/O bug (Bug 5985)
> > -    my($inbuf, $nread);
> > -    my $resp = '';
> > -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
> > -    defined $nread  or die "error reading from pipe: $!";
> > -    @resp = split(/^/m, $resp, -1);
> > -
> > -    my $errno = 0;
> > -    close PYZOR or $errno = $!;
> > -    if (proc_status_ok($?, $errno)) {
> > -      dbg("pyzor: [%s] finished successfully", $pid);
> > -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
> > -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
> > -    } else {
> > -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > -    }
> > -
> > -  });
> > -
> > -  if (defined(fileno(*PYZOR))) {  # still open
> > -    if ($pid) {
> > -      if (kill('TERM', $pid)) {
> > -        dbg("pyzor: killed stale helper [$pid]");
> > -      } else {
> > -        dbg("pyzor: killing helper application [$pid] failed: $!");
> > -      }
> > -    }
> > -    my $errno = 0;
> > -    close PYZOR or $errno = $!;
> > -    proc_status_ok($?, $errno)
> > -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> > -  }
> > -
> > -  $pms->leave_helper_run_mode();
> > -
> > -  if ($timer->timed_out()) {
> > -    dbg("pyzor: check timed out after $timeout seconds");
> > -    return ();
> > -  } elsif ($err) {
> > -    chomp $err;
> > -    info("pyzor: check failed: $err");
> > -    return ();
> > -  }
> > -
> > -  return @resp;
> > -}
> > -
> > -sub check_tick {
> > -  my ($self, $opts) = @_;
> > -  $self->_check_forked_result($opts->{permsgstatus}, 0);
> > -}
> > -
> > -sub check_cleanup {
> > -  my ($self, $opts) = @_;
> > -  $self->_check_forked_result($opts->{permsgstatus}, 1);
> > -}
> > -
> > -sub _check_forked_result {
> > -  my ($self, $pms, $finish) = @_;
> > -
> > -  return 0 if !$pms->{pyzor_backchannel};
> > -  return 0 if !$pms->{pyzor_pid};
> > +    my ( $self, $permsgstatus, $fulltext ) = @_;
> > +    my $conf = $self->{main}->{conf};
> > +    my $timeout = $conf->{pyzor_timeout};
> > +
> > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
> > +
> > +    local $@;
> > +    my $ref = eval { $client->check($digest); };
> > +    dbg("pyzor: got response: $client->{'_server_host'}");
> > +    # $client reply must be an hash
> > +    return 0 if (not (ref $ref eq ref {}));
> > +    if ($@) {
> > +        my $err = $@;
> >  
> > -  my $timer = $self->{main}->time_method("check_pyzor");
> > +        $err = eval { $err->get_message() } || $err;
> >  
> > -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
> > -
> > -  my $kid_pid = $pms->{pyzor_pid};
> > -  # if $finish, force waiting for the child
> > -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
> > -  if ($pid == 0) {
> > -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
> > -    if ($pms->{pyzor_abort}) {
> > -      dbg("pyzor: bailing out due to deadline/shortcircuit");
> > -      kill('TERM', $kid_pid);
> > -      if (waitpid($kid_pid, WNOHANG) == 0) {
> > -        sleep(1);
> > -        if (waitpid($kid_pid, WNOHANG) == 0) {
> > -          dbg("pyzor: child process $kid_pid still alive, KILL");
> > -          kill('KILL', $kid_pid);
> > -          waitpid($kid_pid, 0);
> > +        warn("pyzor: check failed: $err\n");
> > +        return 0;
> > +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
> > +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
> > +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > +        } else {
> > +          dbg("pyzor: check failed with undefined code");
> >          }
> > -      }
> > -      delete $pms->{pyzor_pid};
> > -      delete $pms->{pyzor_backchannel};
> > +        return 0;
> >      }
> > -    return 0;
> > -  } elsif ($pid == -1) {
> > -    # child does not exist?
> > -    dbg("pyzor: child process $kid_pid already handled?");
> > -    delete $pms->{pyzor_backchannel};
> > -    return 0;
> > -  }
> >  
> > -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
> > +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
> > +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
> > +    my $count_min = $conf->{pyzor_count_min};
> > +    my $wl_min = $conf->{pyzor_whitelist_min};
> >  
> > -  dbg("pyzor: child process $kid_pid finished, reading results");
> > +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
> > +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
> >  
> > -  my $backmsg;
> > -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
> > -  if (!defined $ret || $ret == 0) {
> > -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
> > -    delete $pms->{pyzor_backchannel};
> > -    return 0;
> > -  }
> > -
> > -  delete $pms->{pyzor_backchannel};
> > +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
> >  
> > -  my $results;
> > -  eval {
> > -    $results = Storable::thaw($backmsg);
> > -  };
> > -  if ($@) {
> > -    dbg("pyzor: child return value thaw failed: $@");
> > -    return;
> > -  }
> > -
> > -  $self->_check_result($pms, $results);
> > -}
> > +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
> > +      $wl_limit);
> >  
> > -sub _check_result {
> > -  my ($self, $pms, $results) = @_;
> > -
> > -  if (!@$results) {
> > -    dbg("pyzor: no response from server");
> > -    return 0;
> > -  }
> > -
> > -  my $count = 0;
> > -  my $count_wl = 0;
> > -  foreach my $res (@$results) {
> > -    chomp($res);
> > -    if ($res =~ /^Traceback/) {
> > -      info("pyzor: internal error, python traceback seen in response: $res");
> > +    # Empty body etc results in same hash, we should skip very large numbers..
> > +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
> > +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> >        return 0;
> >      }
> > -    dbg("pyzor: got response: $res");
> > -    # this regexp is intended to be a little bit forgiving
> > -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
> > -      # until pyzor servers can sync their DBs,
> > -      # sum counts obtained from all servers
> > -      $count += untaint_var($1)+0; # crazy but needs untainting
> > -      $count_wl += untaint_var($2)+0;
> > -    } else {
> > -      # warn on failures to parse
> > -      info("pyzor: failure to parse response \"$res\"");
> > -    }
> > -  }
> > -
> > -  my $conf = $self->{main}->{conf};
> > -
> > -  my $count_min = $conf->{pyzor_count_min};
> > -  my $wl_min = $conf->{pyzor_whitelist_min};
> >  
> > -  my $wl_limit = $count_wl >= $wl_min ?
> > -    $count * $conf->{pyzor_whitelist_factor} : 0;
> > -
> > -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
> > -    $wl_limit);
> > -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
> > -
> > -  # Empty body etc results in same hash, we should skip very large numbers..
> > -  if ($count >= 1000000 || $count_wl >= 10000) {
> > -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> > -    return 0;
> > -  }
> > -
> > -  # Whitelisted?
> > -  if ($wl_limit && $count_wl >= $wl_limit) {
> > -    dbg("pyzor: message whitelisted");
> > -    return 0;
> > -  }
> > +    # Whitelisted?
> > +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
> > +      dbg("pyzor: message whitelisted");
> > +      return 0;
> > +    }
> >  
> > -  if ($count >= $count_min) {
> > -    if ($conf->{pyzor_fork}) {
> > -      # forked needs to run got_hit()
> > -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
> > +    if ( $pyzor_count >= $count_min ) {
> > +      return 1;
> >      }
> > -    return 1;
> > -  }
> >  
> > -  return 0;
> > +    return 0;
> >  }
> >  
> >  sub plugin_report {
> >    my ($self, $options) = @_;
> >  
> > -  return if !$self->{pyzor_available};
> > -  return if !$self->{main}->{conf}->{use_pyzor};
> > -  return if $options->{report}->{options}->{dont_report_to_pyzor};
> > -  return if !$self->is_pyzor_available();
> > -
> > -  # use temporary file: open2() is unreliable due to buffering under spamd
> > -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
> > -  if ($self->pyzor_report($options, $tmpf)) {
> > -    $options->{report}->{report_available} = 1;
> > -    info("reporter: spam reported to Pyzor");
> > -    $options->{report}->{report_return} = 1;
> > -  }
> > -  else {
> > -    info("reporter: could not report spam to Pyzor");
> > -  }
> > -  $options->{report}->delete_fulltext_tmpfile($tmpf);
> > +  return unless $self->{pyzor_available};
> > +  return unless $self->{main}->{conf}->{use_pyzor};
> >  
> > -  return 1;
> > +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
> > +  {
> > +    if ($self->pyzor_report($options)) {
> > +      $options->{report}->{report_available} = 1;
> > +      info("reporter: spam reported to Pyzor");
> > +      $options->{report}->{report_return} = 1;
> > +    }
> > +    else {
> > +      info("reporter: could not report spam to Pyzor");
> > +    }
> > +  }
> >  }
> >  
> >  sub pyzor_report {
> > -  my ($self, $options, $tmpf) = @_;
> > -
> > -  # note: not really tainted, this came from system configuration file
> > -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
> > -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
> > +    my ( $self, $options ) = @_;
> >  
> > -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> >  
> > -  $options->{report}->enter_helper_run_mode();
> > +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> >  
> > -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
> > -  my $err = $timer->run_and_catch(sub {
> > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
> >  
> > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > -
> > -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
> > -
> > -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > -	$tmpf, 1, $path, split(' ', $opts), "report");
> > -    $pid or die "$!\n";
> > -
> > -    my($inbuf,$nread,$nread_all); $nread_all = 0;
> > -    # response is ignored, just check its existence
> > -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
> > -    defined $nread  or die "error reading from pipe: $!";
> > -
> > -    dbg("pyzor: empty response")  if $nread_all < 1;
> > -
> > -    my $errno = 0;  close PYZOR or $errno = $!;
> > -    # closing a pipe also waits for the process executing on the pipe to
> > -    # complete, no need to explicitly call waitpid
> > -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
> > -    if (proc_status_ok($?,$errno, 0)) {
> > -      dbg("pyzor: [%s] reporter finished successfully", $pid);
> > -    } else {
> > -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
> > +    local $@;
> > +    my $ref = eval { $client->report($digest); };
> > +    if ($@) {
> > +        warn("pyzor: report failed: $@");
> > +        return 0;
> >      }
> > -
> > -  });
> > -
> > -  $options->{report}->leave_helper_run_mode();
> > -
> > -  if ($timer->timed_out()) {
> > -    dbg("reporter: pyzor report timed out after $timeout seconds");
> > -    return 0;
> > -  }
> > -
> > -  if ($err) {
> > -    chomp $err;
> > -    if ($err eq '__brokenpipe__ignore__') {
> > -      dbg("reporter: pyzor report failed: broken pipe");
> > -    } else {
> > -      warn("reporter: pyzor report failed: $err\n");
> > +    elsif ( $ref->{'Code'} ne 200 ) {
> > +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> > +        return 0;
> >      }
> > -    return 0;
> > -  }
> >  
> > -  return 1;
> > +    return 1;
> >  }
> >  
> > -# Version features
> > -sub has_fork { 1 }
> > -
> >  1;
> > -
> > -=back
> > -
> > -=cut
> > diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
> > new file mode 100644
> > index 0000000..8ac27f4
> > --- /dev/null
> > +++ b/lib/Mail/SpamAssassin/Pyzor.pm
> > @@ -0,0 +1,56 @@
> > +package Mail::SpamAssassin::Pyzor;
> > +
> > +# Copyright 2018 cPanel, LLC.
> > +# All rights reserved.
> > +# http://cpanel.net
> > +#
> > +# <@LICENSE>
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > +# (the "License"); you may not use this file except in compliance with
> > +# the License.  You may obtain a copy of the License at:
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > +# See the License for the specific language governing permissions and
> > +# limitations under the License.
> > +# </...@LICENSE>
> > +#
> > +
> > +use strict;
> > +use warnings;
> > +
> > +our $VERSION = '0.06_01';
> > +
> > +=encoding utf-8
> > +
> > +=head1 NAME
> > +
> > +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
> > +
> > +=head1 DESCRIPTION
> > +
> > +This distribution contains Perl implementations of parts of
> > +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
> > +It is intended for use with L<Mail::SpamAssassin> but may be useful
> > +in other contexts.
> > +
> > +See the following modules for information on specific tools that
> > +the distribution includes:
> > +
> > +=over
> > +
> > +=item * L<Mail::SpamAssassin::Pyzor::Client>
> > +
> > +=item * L<Mail::SpamAssassin::Pyzor::Digest>
> > +
> > +=back
> > +
> > +=cut
> > +
> > +1;
> > diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > new file mode 100644
> > index 0000000..ccff868
> > --- /dev/null
> > +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > @@ -0,0 +1,415 @@
> > +package Mail::SpamAssassin::Pyzor::Client;
> > +
> > +# Copyright 2018 cPanel, LLC.
> > +# All rights reserved.
> > +# http://cpanel.net
> > +#
> > +# <@LICENSE>
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > +# (the "License"); you may not use this file except in compliance with
> > +# the License.  You may obtain a copy of the License at:
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > +# See the License for the specific language governing permissions and
> > +# limitations under the License.
> > +# </...@LICENSE>
> > +#
> > +
> > +use strict;
> > +use warnings;
> > +
> > +=encoding utf-8
> > +
> > +=head1 NAME
> > +
> > +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
> > +
> > +=head1 SYNOPSIS
> > +
> > +    use Mail::SpamAssassin::Pyzor::Client ();
> > +    use Mail::SpamAssassin::Pyzor::Digest ();
> > +
> > +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
> > +
> > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
> > +
> > +    my $check_ref = $client->check($digest);
> > +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
> > +
> > +    my $report_ref = $client->report($digest);
> > +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
> > +
> > +=head1 DESCRIPTION
> > +
> > +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
> > +implements the functionality needed for L<Mail::SpamAssassin>.
> > +
> > +=head1 PROTOCOL DETAILS
> > +
> > +The Pyzor protocol is not a published standard, and there appears to be
> > +no meaningful public documentation. What follows is enough information,
> > +largely gleaned through forum posts and reverse engineering, to facilitate
> > +effective use of this module:
> > +
> > +Pyzor is an RPC-oriented, message-based protocol. Each message
> > +is a simple dictionary of 7-bit ASCII keys and values. Server responses
> > +always include at least the following:
> > +
> > +=over
> > +
> > +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
> > +is an error.
> > +
> > +=item * C<Diag> - Similar to HTTP status reasons: a text description
> > +of the status.
> > +
> > +=back
> > +
> > +(NB: There are additional standard response headers that are useful only for
> > +the protocol itself and thus are not part of this module???s returns.)
> > +
> > +=head2 Reliability
> > +
> > +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
> > +destination. A transmission failure can happen in either the request or
> > +the response; in either case, a timeout error will result. Such errors
> > +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
> > +
> > +=cut
> > +
> > +#----------------------------------------------------------------------
> > +
> > +our $VERSION = '0.04';
> > +
> > +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
> > +our $DEFAULT_SERVER_PORT    = 24441;
> > +our $DEFAULT_USERNAME       = 'anonymous';
> > +our $DEFAULT_PASSWORD       = '';
> > +our $DEFAULT_OP_SPEC        = '20,3,60,3';
> > +our $PYZOR_PROTOCOL_VERSION = 2.1;
> > +our $DEFAULT_TIMEOUT        = 3.5;
> > +our $READ_SIZE              = 8192;
> > +
> > +use IO::Socket::INET ();
> > +use Digest::SHA qw(sha1 sha1_hex);
> > +
> > +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head1 CONSTRUCTOR
> > +
> > +=head2 new(%OPTS)
> > +
> > +Create a new pyzor client.
> > +
> > +=over 2
> > +
> > +=item Input
> > +
> > +%OPTS are (all optional):
> > +
> > +=over 3
> > +
> > +=item * C<server_host> - The pyzor server host to connect to (default is
> > +C<public.pyzor.org>)
> > +
> > +=item * C<server_port> - The pyzor server port to connect to (default is
> > +24441)
> > +
> > +=item * C<username> - The username to present to the pyzor server (default
> > +is C<anonymous>)
> > +
> > +=item * C<password> - The password to present to the pyzor server (default
> > +is empty)
> > +
> > +=item * C<timeout> - The maximum time, in seconds, to wait for a response
> > +from the pyzor server (defeault is 3.5)
> > +
> > +=back
> > +
> > +=item Output
> > +
> > +=over 3
> > +
> > +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
> > +
> > +=back
> > +
> > +=back
> > +
> > +=cut
> > +
> > +sub new {
> > +    my ( $class, %OPTS ) = @_;
> > +
> > +    return bless {
> > +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
> > +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
> > +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
> > +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
> > +        '_op_spec'     => $DEFAULT_OP_SPEC,
> > +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
> > +    }, $class;
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head1 REQUEST METHODS
> > +
> > +=head2 report($digest)
> > +
> > +Report the digest of a spam message to the pyzor server. This function
> > +will throw if a messaging failure or timeout happens.
> > +
> > +=over 2
> > +
> > +=item Input
> > +
> > +=over 3
> > +
> > +=item $digest C<SCALAR>
> > +
> > +The message digest to report, as given by
> > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > +
> > +=back
> > +
> > +=item Output
> > +
> > +=over 3
> > +
> > +=item C<HASHREF>
> > +
> > +Returns a hashref of the standard attributes noted above.
> > +
> > +=back
> > +
> > +=back
> > +
> > +=cut
> > +
> > +sub report {
> > +    my ( $self, $digest ) = @_;
> > +
> > +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
> > +
> > +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
> > +
> > +    return $self->_send_receive_msg($msg_ref);
> > +}
> > +
> > +=head2 check($digest)
> > +
> > +Check the digest of a message to see if
> > +the pyzor server has a report for it. This function
> > +will throw if a messaging failure or timeout happens.
> > +
> > +=over 2
> > +
> > +=item Input
> > +
> > +=over 3
> > +
> > +=item $digest C<SCALAR>
> > +
> > +The message digest to check, as given by
> > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > +
> > +=back
> > +
> > +=item Output
> > +
> > +=over 3
> > +
> > +=item C<HASHREF>
> > +
> > +Returns a hashref of the standard attributes noted above
> > +as well as the following:
> > +
> > +=over
> > +
> > +=item * C<Count> - The number of reports the server has received
> > +for the given digest.
> > +
> > +=item * C<WL-Count> - The number of whitelist requests the server has received
> > +for the given digest.
> > +
> > +=back
> > +
> > +=back
> > +
> > +=back
> > +
> > +=cut
> > +
> > +sub check {
> > +    my ( $self, $digest ) = @_;
> > +
> > +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
> > +}
> > +
> > +# ----------------------------------------
> > +
> > +sub _send_receive_msg {
> > +    my ( $self, $msg_ref ) = @_;
> > +
> > +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
> > +
> > +    $self->_sign_msg($msg_ref);
> > +
> > +    return $self->_do_send_receive(
> > +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
> > +        $thread_id,
> > +    );
> > +}
> > +
> > +sub _get_base_msg {
> > +    my ( $self, $op, $digest ) = @_;
> > +
> > +    die "Implementor error: op is required" if !$op;
> > +    die "error: digest is required"         if !$digest;
> > +
> > +    return {
> > +        'User'      => $self->{'_username'},
> > +        'PV'        => $PYZOR_PROTOCOL_VERSION,
> > +        'Time'      => time(),
> > +        'Op'        => $op,
> > +        'Op-Digest' => $digest,
> > +        'Thread'    => $self->_generate_thread_id()
> > +    };
> > +}
> > +
> > +sub _do_send_receive {
> > +    my ( $self, $packet, $thread_id ) = @_;
> > +
> > +    my $sock = $self->_get_connection_or_die();
> > +
> > +    $self->_send_packet( $sock, $packet );
> > +    my $response = $self->_receive_packet( $sock, $thread_id );
> > +
> > +    return 0 if not defined $response;
> > +
> > +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
> > +
> > +    delete $resp_hr->{'Thread'};
> > +
> > +    my $response_pv = delete $resp_hr->{'PV'};
> > +
> > +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
> > +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
> > +    }
> > +
> > +    return $resp_hr;
> > +}
> > +
> > +sub _receive_packet {
> > +    my ( $self, $sock, $thread_id ) = @_;
> > +
> > +    my $timeout = $self->{'_timeout'} * 1000;
> > +
> > +    my $end_time = time + $self->{'_timeout'};
> > +
> > +    $sock->blocking(0);
> > +    my $response = '';
> > +    my $rout     = '';
> > +    my $rin      = '';
> > +    vec( $rin, fileno($sock), 1 ) = 1;
> > +
> > +    while (1) {
> > +        my $time_left = $end_time - time;
> > +
> > +        if ( $time_left <= 0 ) {
> > +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
> > +          return;
> > +        }
> > +
> > +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
> > +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
> > +            warn "read from socket: $!";
> > +        }
> > +
> > +        if ( index( $response, "\n\n" ) > -1 ) {
> > +
> > +            # Reject the response unless its thread ID matches what we sent.
> > +            # This prevents confusion among concurrent Pyzor reqeusts.
> > +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
> > +                last;
> > +            }
> > +            else {
> > +                $response = '';
> > +            }
> > +        }
> > +
> > +        my $found = select( $rout = $rin, undef, undef, $time_left );
> > +        warn "select(): $!" if $found == -1;
> > +    }
> > +
> > +    return $response;
> > +}
> > +
> > +sub _send_packet {
> > +    my ( $self, $sock, $packet ) = @_;
> > +
> > +    $sock->blocking(1);
> > +    syswrite( $sock, $packet ) or warn "write to socket: $!";
> > +
> > +    return;
> > +}
> > +
> > +sub _get_connection_or_die {
> > +    my ($self) = @_;
> > +
> > +    # clear the socket if the PID changes
> > +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
> > +        undef $self->{'_sock_pid'};
> > +        undef $self->{'_sock'};
> > +    }
> > +
> > +    $self->{'_sock_pid'} ||= $$;
> > +    $self->{'_sock'}     ||= IO::Socket::INET->new(
> > +        'PeerHost' => $self->{'_server_host'},
> > +        'PeerPort' => $self->{'_server_port'},
> > +        'Proto'    => 'udp'
> > +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
> > +
> > +    return $self->{'_sock'};
> > +}
> > +
> > +sub _sign_msg {
> > +    my ( $self, $msg_ref ) = @_;
> > +
> > +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
> > +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
> > +    );
> > +
> > +    return 1;
> > +}
> > +
> > +sub _generate_packet_from_message {
> > +    my ( $self, $msg_ref ) = @_;
> > +
> > +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
> > +}
> > +
> > +sub _generate_thread_id {
> > +    my $RAND_MAX = 2**16;
> > +    my $val      = 0;
> > +    $val = int rand($RAND_MAX) while $val < 1024;
> > +    return $val;
> > +}
> > +
> > +sub _get_user_pass_hash_key {
> > +    my ($self) = @_;
> > +
> > +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
> > +}
> > +
> > +1;
> > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > new file mode 100644
> > index 0000000..0e8a5ae
> > --- /dev/null
> > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > @@ -0,0 +1,103 @@
> > +package Mail::SpamAssassin::Pyzor::Digest;
> > +
> > +# Copyright 2018 cPanel, LLC.
> > +# All rights reserved.
> > +# http://cpanel.net
> > +#
> > +# <@LICENSE>
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > +# (the "License"); you may not use this file except in compliance with
> > +# the License.  You may obtain a copy of the License at:
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > +# See the License for the specific language governing permissions and
> > +# limitations under the License.
> > +# </...@LICENSE>
> > +#
> > +
> > +use strict;
> > +use warnings;
> > +
> > +=encoding utf-8
> > +
> > +=head1 NAME
> > +
> > +Mail::SpamAssassin::Pyzor::Digest
> > +
> > +=head1 SYNOPSIS
> > +
> > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
> > +
> > +=head1 DESCRIPTION
> > +
> > +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
> > +
> > +=cut
> > +
> > +#----------------------------------------------------------------------
> > +
> > +use Email::MIME ();
> > +
> > +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
> > +use Digest::SHA qw(sha1_hex);
> > +
> > +our $VERSION = '0.03';
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head1 FUNCTIONS
> > +
> > +=head2 $hex = get( $MSG )
> > +
> > +This takes an email message in raw MIME text format (i.e., as saved in the
> > +standard mbox format) and returns the message???s Pyzor digest in lower-case
> > +hexadecimal.
> > +
> > +The output from this function should normally be identical to that of
> > +the C<pyzor> script???s C<digest> command. It is suitable for use in
> > +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
> > +
> > +=cut
> > +
> > +sub get {
> > +    my ($text) = @_;
> > +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
> > +}
> > +
> > +# NB: This is called from the test.
> > +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
> > +    my ($msg_text_sr) = @_;
> > +
> > +    my $parsed = Email::MIME->new($$msg_text_sr);
> > +
> > +    my @lines;
> > +
> > +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
> > +
> > +    for my $payload (@$payloads_ar) {
> > +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
> > +        for my $line (@p_lines) {
> > +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
> > +
> > +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
> > +
> > +            # Make sure we have an octet string.
> > +            utf8::encode($line) if utf8::is_utf8($line);
> > +
> > +            push @lines, $line;
> > +        }
> > +    }
> > +
> > +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
> > +
> > +    return $digest_sr;
> > +}
> > +
> > +1;
> > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > new file mode 100644
> > index 0000000..522accd
> > --- /dev/null
> > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > @@ -0,0 +1,301 @@
> > +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
> > +
> > +# Copyright 2018 cPanel, LLC.
> > +# All rights reserved.
> > +# http://cpanel.net
> > +#
> > +# <@LICENSE>
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > +# (the "License"); you may not use this file except in compliance with
> > +# the License.  You may obtain a copy of the License at:
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > +# See the License for the specific language governing permissions and
> > +# limitations under the License.
> > +# </...@LICENSE>
> > +#
> > +
> > +use strict;
> > +use warnings;
> > +
> > +=encoding utf-8
> > +
> > +=head1 NAME
> > +
> > +Mail::SpamAssassin::Pyzor::Digest::Pieces
> > +
> > +=head1 DESCRIPTION
> > +
> > +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
> > +
> > +It reimplements logic found in pyzor???s F<digest.py> module
> > +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
> > +
> > +=cut
> > +
> > +#----------------------------------------------------------------------
> > +
> > +use Email::MIME::ContentType ();
> > +use Encode                   ();
> > +
> > +our $VERSION = '0.03';
> > +
> > +# each tuple is [ offset, length ]
> > +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
> > +
> > +use constant {
> > +    _MIN_LINE_LENGTH => 8,
> > +
> > +    _ATOMIC_NUM_LINES => 4,
> > +};
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head1 FUNCTIONS
> > +
> > +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
> > +
> > +This imitates the corresponding object method in F<digest.py>.
> > +It returns a reference to an array of strings. Each string can be either
> > +a byte string or a character string (e.g., UTF-8 decoded).
> > +
> > +NB: RFC 2822 stipulates that message bodies should use CRLF
> > +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
> > +will thus convert any plain CRs in a quoted-printable message
> > +body into CRLF. Python, though, doesn???t do this, so the output of
> > +our implementation of C<digest_payloads()> diverges from that of the Python
> > +original. It doesn???t ultimately make a difference since the line-ending
> > +whitespace gets trimmed regardless, but it???s necessary to factor in when
> > +comparing the output of our implementation with the Python output.
> > +
> > +=cut
> > +
> > +sub digest_payloads {
> > +    my ($parsed) = @_;
> > +
> > +    my @subparts = $parsed->subparts();
> > +
> > +    my @payloads;
> > +
> > +    if (@subparts) {
> > +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
> > +    }
> > +    else {
> > +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
> > +
> > +        my $payload;
> > +
> > +        if ( $main_type eq 'text' ) {
> > +
> > +            # Decode transfer encoding, but leave us as a byte string.
> > +            # Note that this is where Email::MIME converts plain LF to CRLF.
> > +            $payload = $parsed->body();
> > +
> > +            # This does the actual character decoding (i.e., ???charset???).
> > +            $payload = Encode::decode( $encoding, $payload, $encode_check );
> > +
> > +            if ( $subtype eq 'html' ) {
> > +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
> > +            }
> > +        }
> > +        else {
> > +
> > +            # This does no decoding, even of, e.g., quoted-printable or base64.
> > +            $payload = $parsed->body_raw();
> > +        }
> > +
> > +        push @payloads, $payload;
> > +    }
> > +
> > +    return \@payloads;
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head2 normalize( $STRING )
> > +
> > +This imitates the corresponding object method in F<digest.py>.
> > +It modifies C<$STRING> in-place.
> > +
> > +As with the original implementation, if C<$STRING> contains (decoded)
> > +Unicode characters, those characters will be parsed accordingly. So:
> > +
> > +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
> > +
> > +    normalize($str);
> > +
> > +The above will leave C<$str> alone, but this:
> > +
> > +    utf8::decode($str);
> > +
> > +    normalize($str);
> > +
> > +??? will trim off the last two bytes from C<$str>.
> > +
> > +=cut
> > +
> > +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
> > +
> > +    # NULs are bad, mm-kay?
> > +    $_[0] =~ tr<\0><>d;
> > +
> > +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
> > +    # with the /a modifier.
> > +    #
> > +    # https://docs.python.org/2/library/re.html
> > +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
> > +
> > +    # Python: re.compile(r'\S{10,}')
> > +    $_[0] =~ s<\S{10,}><>ag;
> > +
> > +    # Python: re.compile(r'\S+@\S+')
> > +    $_[0] =~ s<\S+ @ \S+><>agx;
> > +
> > +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
> > +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
> > +
> > +    # (from digest.py ???)
> > +    # Make sure we do the whitespace last because some of the previous
> > +    # patterns rely on whitespace.
> > +    $_[0] =~ tr< \x09-\x0d><>d;
> > +
> > +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
> > +    # strip, then calls strip() on the string, which *will* strip Unicode
> > +    # whitespace from the ends.
> > +    $_[0] =~ s<\A\s+><>;
> > +    $_[0] =~ s<\s+\z><>;
> > +
> > +    return;
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head2 $yn = should_handle_line( $STRING )
> > +
> > +This imitates the corresponding object method in F<digest.py>.
> > +It returns a boolean.
> > +
> > +=cut
> > +
> > +sub should_handle_line {
> > +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head2 $sr = assemble_lines( \@LINES )
> > +
> > +This assembles a string buffer out of @LINES. The string is the buffer
> > +of octets that will be hashed to produce the message digest.
> > +
> > +Each member of @LINES is expected to be an B<octet string>, not a
> > +character string.
> > +
> > +=cut
> > +
> > +sub assemble_lines {
> > +    my ($lines_ar) = @_;
> > +
> > +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
> > +
> > +        # cf. handle_atomic() in digest.py
> > +        return \join( q<>, @$lines_ar );
> > +    }
> > +
> > +    #----------------------------------------------------------------------
> > +    # cf. handle_atomic() in digest.py
> > +
> > +    my $str = q<>;
> > +
> > +    for my $ofs_len ( _HASH_SPEC() ) {
> > +        my ( $offset, $length ) = @$ofs_len;
> > +
> > +        for my $i ( 0 .. ( $length - 1 ) ) {
> > +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
> > +
> > +            next if !defined $lines_ar->[$idx];
> > +
> > +            $str .= $lines_ar->[$idx];
> > +        }
> > +    }
> > +
> > +    return \$str;
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
> > +
> > +=cut
> > +
> > +use constant _QUOTED_PRINTABLE_NAMES => (
> > +    "quopri-codec",
> > +    "quopri",
> > +    "quoted-printable",
> > +    "quotedprintable",
> > +);
> > +
> > +# Make Encode::decode() ignore anything that doesn???t fit the
> > +# given encoding.
> > +use constant _encode_check_ignore => q<>;
> > +
> > +sub parse_content_type {
> > +    my ($content_type) = @_;
> > +
> > +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
> > +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
> > +        $content_type,
> > +    );
> > +
> > +    my $main = $ct_parse->{'type'}    || q<>;
> > +    my $sub  = $ct_parse->{'subtype'} || q<>;
> > +
> > +    my $encoding = $ct_parse->{'attributes'}{'charset'};
> > +
> > +    my $checkval;
> > +
> > +    if ($encoding) {
> > +
> > +        # Lower-case everything, convert underscore to dash, and remove NUL.
> > +        $encoding =~ tr<A-Z_\0><a-z->d;
> > +
> > +        # Apparently pyzor accommodates messages that put the transfer
> > +        # encoding in the Content-Type.
> > +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
> > +            $checkval = Encode::FB_CROAK();
> > +        }
> > +    }
> > +    else {
> > +        $encoding = 'ascii';
> > +    }
> > +
> > +    # Match Python .decode()???s 'ignore' behavior
> > +    $checkval ||= \&_encode_check_ignore;
> > +
> > +    return ( $main, $sub, $encoding, $checkval );
> > +}
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head2 @lines = splitlines( $TEXT )
> > +
> > +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
> > +
> > +Returns a plain list in list context. Returns the number of
> > +items to be returned in scalar context.
> > +
> > +=cut
> > +
> > +sub splitlines {
> > +    return split m<\r\n?|\n>, $_[0];
> > +}
> > +
> > +1;
> > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > new file mode 100644
> > index 0000000..2617b4a
> > --- /dev/null
> > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > @@ -0,0 +1,177 @@
> > +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > +
> > +# Copyright 2018 cPanel, LLC.
> > +# All rights reserved.
> > +# http://cpanel.net
> > +#
> > +# <@LICENSE>
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to you under the Apache License, Version 2.0
> > +# (the "License"); you may not use this file except in compliance with
> > +# the License.  You may obtain a copy of the License at:
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > +# See the License for the specific language governing permissions and
> > +# limitations under the License.
> > +# </...@LICENSE>
> > +#
> > +
> > +use strict;
> > +use warnings;
> > +
> > +=encoding utf-8
> > +
> > +=head1 NAME
> > +
> > +Mail::SpamAssassin::Pyzor::Digest::StripHtml
> > +
> > +=head1 SYNOPSIS
> > +
> > +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
> > +
> > +=head1 DESCRIPTION
> > +
> > +This module attempts to duplicate pyzor???s HTML-stripping logic.
> > +
> > +=head1 ACCURACY
> > +
> > +This library cannot achieve 100%, bug-for-bug parity with pyzor
> > +because to do so would require duplicating Python???s own HTML parsing
> > +library. Since that library???s output has changed over time, and those
> > +changes in turn affect pyzor, it???s literally impossible to arrive at
> > +a single, fully-compatible reimplementation.
> > +
> > +That said, all known divergences between pyzor and this library involve
> > +invalid HTML as input.
> > +
> > +Please open bug reports for any divergences you identify, particularly
> > +if the input is valid HTML.
> > +
> > +=cut
> > +
> > +#----------------------------------------------------------------------
> > +
> > +use HTML::Parser ();
> > +
> > +our $VERSION = '0.03';
> > +
> > +#----------------------------------------------------------------------
> > +
> > +=head1 FUNCTIONS
> > +
> > +=head2 $stripped = strip( $HTML )
> > +
> > +Give it some HTML, and it???ll give back the stripped text.
> > +
> > +In B<general>, the stripping consists of removing tags as well as
> > +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
> > +removes HTML entities.
> > +
> > +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
> > +
> > +=cut
> > +
> > +sub strip {
> > +    my ($html) = @_;
> > +
> > +    $html =~ s<\A\s+><>;
> > +    $html =~ s<\s+\z><>;
> > +
> > +    my $p = HTML::Parser->new( api_version => 3 );
> > +
> > +    my @pieces;
> > +
> > +    my $accumulate = 1;
> > +
> > +    $p->handler(
> > +        start => sub {
> > +            my ($tagname) = @_;
> > +
> > +            $accumulate = 0 if $tagname eq 'script';
> > +            $accumulate = 0 if $tagname eq 'style';
> > +
> > +            return;
> > +        },
> > +        'tagname',
> > +    );
> > +
> > +    $p->handler(
> > +        end => sub {
> > +            $accumulate = 1;
> > +            return;
> > +        }
> > +    );
> > +
> > +    $p->handler(
> > +        text => sub {
> > +            my ($copy) = @_;
> > +
> > +            return if !$accumulate;
> > +
> > +            # pyzor???s HTML parser discards HTML entities. On top of that,
> > +            # we need to match, as closely as possible, pyzor???s handling of
> > +            # invalid HTML entities ??? which is a function of Python???s
> > +            # standard HTML parsing library. This will probably never be
> > +            # fully compatible with the pyzor, but we can get it close.
> > +
> > +            # The original is:
> > +            #
> > +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
> > +            #
> > +            # The parsing loop then ???backs up??? one byte if the last
> > +            # character isn???t a ???;???. We use a look-ahead assertion to
> > +            # mimic that behavior.
> > +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
> > +
> > +            # The original is:
> > +            #
> > +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
> > +            #
> > +            # We again use a look-ahead assertion to mimic Python.
> > +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
> > +
> > +            # Python???s HTMLParser aborts its parsing loop when it encounters
> > +            # an invalid numeric reference.
> > +            $copy =~ s<\&\#
> > +                (?:
> > +                    [^0-9xX]        # anything but the expected first char
> > +                    |
> > +                    [0-9]+[a-fA-F]  # hex within decimal
> > +                    |
> > +                    [xX][^0-9a-fA-F]
> > +                )
> > +                (.*)
> > +            ><
> > +                ( -1 == index($1, ';') ) ? q<> : '&#'
> > +            >exs;
> > +
> > +            # Python???s HTMLParser treats invalid entities as incomplete
> > +            $copy =~ s<(\&\#?)><$1 >gx;
> > +
> > +            $copy =~ s<\A\s+><>;
> > +            $copy =~ s<\s+\z><>;
> > +
> > +            push @pieces, \$copy if length $copy;
> > +        },
> > +        'text,tagname',
> > +    );
> > +
> > +    $p->parse($html);
> > +    $p->eof();
> > +
> > +    my $payload = join( q< >, map { $$_ } @pieces );
> > +
> > +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
> > +    # plain spaces.
> > +    $payload =~ s<[^\S\x{a0}]+>< >g;
> > +
> > +    return $payload;
> > +}
> > +
> > +1;
> > diff --git a/t/pyzor.t b/t/pyzor.t
> > index 891f38d..e4ef83f 100755
> > --- a/t/pyzor.t
> > +++ b/t/pyzor.t
> > @@ -3,12 +3,9 @@
> >  use lib '.'; use lib 't';
> >  use SATest; sa_t_init("pyzor");
> >  
> > -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
> > -
> >  use Test::More;
> >  plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
> > -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
> > -plan tests => 8;
> > +plan tests => 5;
> >  
> >  diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
> >  
> > @@ -30,7 +27,7 @@ tstprefs ("
> >  sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> >  ok_all_patterns();
> >  # Same with fork
> > -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
> > +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> >  ok_all_patterns();
> >  
> >  #TESTING FOR HAM
> > @@ -44,7 +41,3 @@ ok_all_patterns();
> >  
> >  sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
> >  ok_all_patterns();
> > -# same with fork
> > -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
> > -ok_all_patterns();
> > -
> 
> 

Re: new Pyzor implementation

Posted by Henrik K <he...@hege.li>.
If it's developed by cPanel in CPAN, then it should not be committed to SA,
unless it's clearly donated to SpamAssassin and removed from CPAN.  Assuming
we have developer resources and will to take it aboard.

As it is, Plugin/Pyzor.pm should have an option to choose which one to use,
as it makes no sense to ditch support for the widely installed original
Pyzor.


On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
> Hi,
> cPanel has developed a native Perl Pyzor implementation for SpamAssassin
> and a diff against SpamAssassin 4.0 follows.
> Atm I am using it in production on a small server, more tests and
> opinions are welcome.
> 
> Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
> 
>  Cheers
>   Giovanni
> 
> diff --git a/MANIFEST b/MANIFEST
> index 25d0192..2d9588c 100644
> --- a/MANIFEST
> +++ b/MANIFEST
> @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
>  lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
>  lib/Mail/SpamAssassin/PluginHandler.pm
>  lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
> +lib/Mail/SpamAssassin/Pyzor/Client.pm
> +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> +lib/Mail/SpamAssassin/Pyzor/Digest.pm
> +lib/Mail/SpamAssassin/Pyzor.pm
>  lib/Mail/SpamAssassin/RegistryBoundaries.pm
>  lib/Mail/SpamAssassin/Reporter.pm
>  lib/Mail/SpamAssassin/SQLBasedAddrList.pm
> diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> index 3efd4b4..e4c9c05 100644
> --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
>  
>  use Mail::SpamAssassin::Plugin;
>  use Mail::SpamAssassin::Logger;
> -use Mail::SpamAssassin::Timeout;
> -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
> -                                proc_status_ok exit_status_str);
> +use Mail::SpamAssassin::Util qw(untaint_var);
> +
>  use strict;
>  use warnings;
>  # use bytes;
>  use re 'taint';
>  
> -use Storable;
> -use POSIX qw(PIPE_BUF WNOHANG _exit);
> -
>  our @ISA = qw(Mail::SpamAssassin::Plugin);
>  
>  sub new {
> @@ -78,7 +74,7 @@ sub set_config {
>    my ($self, $conf) = @_;
>    my @cmds;
>  
> -=head1 USER OPTIONS
> +=head1 ADMINISTRATOR OPTIONS
>  
>  =over 4
>  
> @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
>      type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
>    });
>  
> -=item pyzor_fork (0|1)		(default: 0)
> -
> -Instead of running Pyzor synchronously, fork separate process for it and
> -read the results in later (similar to async DNS lookups).  Increases
> -throughput.  Experimental.
> -
> -=cut
> -
> -  push(@cmds, {
> -    setting => 'pyzor_fork',
> -    is_admin => 1,
> -    default => 0,
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> -  });
> -
> -=item pyzor_count_min NUMBER	(default: 5)
> +=item pyzor_count_min NUMBER		(default: 5)
>  
>  This option sets how often a message's body checksum must have been
>  reported to the Pyzor server before SpamAssassin will consider the Pyzor
> @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
>      type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>    });
>  
> -  # Deprecated setting, the name makes no sense!
> -  push (@cmds, {
> -    setting => 'pyzor_max',
> -    is_admin => 1,
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> -    code => sub {
> -      my ($self, $key, $value, $line) = @_;
> -      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
> -      if ($value !~ /^\d+$/) {
> -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> -      }
> -      $self->{pyzor_count_min} = $value;
> -    }
> -  });
> -
> -=item pyzor_whitelist_min NUMBER	(default: 10)
> -
> -This option sets how often a message's body checksum must have been
> -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> -result.  Final decision is made by pyzor_whitelist_factor.
> -
> -=cut
> -
> -  push (@cmds, {
> -    setting => 'pyzor_whitelist_min',
> -    is_admin => 1,
> -    default => 10,
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> -  });
> -
> -=item pyzor_whitelist_factor NUMBER	(default: 0.2)
> -
> -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> -For default setting this means: 50 reports requires 10 whitelistings.
> -
> -=cut
> -
> -  push (@cmds, {
> -    setting => 'pyzor_whitelist_factor',
> -    is_admin => 1,
> -    default => 0.2,
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> -  });
> -
>  =back
>  
> -=head1 ADMINISTRATOR OPTIONS
> -
>  =over 4
>  
>  =item pyzor_timeout n		(default: 5)
> @@ -210,478 +145,182 @@ removing one of them.
>      type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
>    });
>  
> -=item pyzor_options options
> +=item pyzor_whitelist_min NUMBER        (default: 10)
>  
> -Specify additional options to the pyzor(1) command. Please note that only
> -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
> +This option sets how often a message's body checksum must have been
> +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
> +result.  Final decision is made by pyzor_whitelist_factor.
>  
>  =cut
>  
>    push (@cmds, {
> -    setting => 'pyzor_options',
> +    setting => 'pyzor_whitelist_min',
>      is_admin => 1,
> -    default => '',
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> -    code => sub {
> -      my ($self, $key, $value, $line) = @_;
> -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> -      }
> -      $self->{pyzor_options} = $1;
> -    }
> +    default => 10,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>    });
>  
> -=item pyzor_path STRING
> +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
>  
> -This option tells SpamAssassin specifically where to find the C<pyzor>
> -client instead of relying on SpamAssassin to find it in the current
> -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
> -you should use this, as the current PATH will have been cleared.
> +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> +For default setting this means: 50 reports requires 10 whitelistings.
>  
>  =cut
>  
>    push (@cmds, {
> -    setting => 'pyzor_path',
> +    setting => 'pyzor_whitelist_factor',
>      is_admin => 1,
> -    default => undef,
> -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> -    code => sub {
> -      my ($self, $key, $value, $line) = @_;
> -      if (!defined $value || !length $value) {
> -	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> -      }
> -      $value = untaint_file_path($value);
> -      if (!-x $value) {
> -	info("config: pyzor_path \"$value\" isn't an executable");
> -	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> -      }
> -
> -      $self->{pyzor_path} = $value;
> -    }
> +    default => 0.2,
> +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
>    });
>  
>    $conf->{parser}->register_commands(\@cmds);
>  }
>  
>  sub is_pyzor_available {
> -  my ($self) = @_;
> +    my ($self) = @_;
>  
> -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
> -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
> -
> -  unless ($pyzor && -x $pyzor) {
> -    dbg("pyzor: no pyzor executable found");
> -    $self->{pyzor_available} = 0;
> -    return 0;
> -  }
> -
> -  # remember any found pyzor
> -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
> -
> -  dbg("pyzor: pyzor is available: $pyzor");
> -  return 1;
> +    local $@;
> +    eval {
> +        require Mail::SpamAssassin::Pyzor::Digest;
> +        require Mail::SpamAssassin::Pyzor::Client;
> +    };
> +    return $@ ? 0 : 1;
>  }
>  
> -sub finish_parsing_start {
> -  my ($self, $opts) = @_;
> +sub get_pyzor_interface {
> +  my ($self) = @_;
>  
> -  # If forking, hard adjust priority -100 to launch early
> -  # Find rulenames from eval_to_rule mappings
> -  if ($opts->{conf}->{pyzor_fork}) {
> -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
> -      dbg("pyzor: adjusting rule $_ priority to -100");
> -      $opts->{conf}->{priority}->{$_} = -100;
> -    }
> +  if (!$self->{main}->{conf}->{use_pyzor}) {
> +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
> +    $self->{pyzor_interface} = "disabled";
> +    $self->{pyzor_available} = 0;
> +  }
> +  elsif ($self->is_pyzor_available()) {
> +    $self->{pyzor_interface} = "pyzor";
> +    $self->{pyzor_available} = 1;
> +  }
> +  else {
> +    dbg("pyzor: no pyzor found, disabling Pyzor");
> +    $self->{pyzor_available} = 0;
>    }
>  }
>  
>  sub check_pyzor {
> -  my ($self, $pms, $full) = @_;
> -
> -  return 0 if !$self->{pyzor_available};
> -  return 0 if !$self->{main}->{conf}->{use_pyzor};
> -
> -  return 0 if $pms->{pyzor_running};
> -  $pms->{pyzor_running} = 1;
> -
> -  return 0 if !$self->is_pyzor_available();
> -
> -  my $timer = $self->{main}->time_method("check_pyzor");
> +  my ($self, $permsgstatus, $full) = @_;
>  
>    # initialize valid tags
> -  $pms->{tag_data}->{PYZOR} = '';
> -
> -  # create fulltext tmpfile now (before possible forking)
> -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
> -
> -  ## non-forking method
> -
> -  if (!$self->{main}->{conf}->{pyzor_fork}) {
> -    my @results = $self->pyzor_lookup($pms);
> -    return $self->_check_result($pms, \@results);
> -  }
> -
> -  ## forking method
> -
> -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
> -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
> -
> -  # create socketpair for communication
> -  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
> -  my $back_selector = '';
> -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
> -  eval {
> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
> -  } or do {
> -    dbg("pyzor: backchannel pre-setup failed: $@");
> -    delete $pms->{pyzor_backchannel};
> -    return 0;
> -  };
> +  $permsgstatus->{tag_data}->{PYZOR} = "";
>  
> -  my $pid = fork();
> -  if (!defined $pid) {
> -    info("pyzor: child fork failed: $!");
> -    delete $pms->{pyzor_backchannel};
> -    return 0;
> -  }
> -  if (!$pid) {
> -    $0 = "$0 (pyzor)";
> -    $SIG{CHLD} = 'DEFAULT';
> -    $SIG{PIPE} = 'IGNORE';
> -    $SIG{$_} = sub {
> -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
> -      _exit(6);  # avoid END and destructor processing
> -      kill('KILL',$$);  # still kicking? die!
> -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
> -    dbg("pyzor: child process $$ forked");
> -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
> -    my @results = $self->pyzor_lookup($pms);
> -    my $backmsg;
> -    eval {
> -      $backmsg = Storable::freeze(\@results);
> -    };
> -    if ($@) {
> -      dbg("pyzor: child return value freeze failed: $@");
> -      _exit(0); # avoid END and destructor processing
> -    }
> -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
> -      dbg("pyzor: child backchannel write failed: $!");
> -    }
> -    _exit(0); # avoid END and destructor processing
> -  }
> -
> -  $pms->{pyzor_pid} = $pid;
> +  my $timer = $self->{main}->time_method("check_pyzor");
>  
> -  eval {
> -    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
> -  } or do {
> -    dbg("pyzor: backchannel post-setup failed: $@");
> -    delete $pms->{pyzor_backchannel};
> -    return 0;
> -  };
> +  $self->get_pyzor_interface();
> +  return 0 unless $self->{pyzor_available};
>  
> -  return 0;
> +  return $self->pyzor_lookup($permsgstatus, $full);
>  }
>  
>  sub pyzor_lookup {
> -  my ($self, $pms) = @_;
> -
> -  my $conf = $self->{main}->{conf};
> -  my $timeout = $conf->{pyzor_timeout};
> -
> -  # note: not really tainted, this came from system configuration file
> -  my $path = untaint_file_path($conf->{pyzor_path});
> -  my $opts = untaint_var($conf->{pyzor_options}) || '';
> -
> -  $pms->enter_helper_run_mode();
> -
> -  my $pid;
> -  my @resp;
> -  my $timer = Mail::SpamAssassin::Timeout->new(
> -           { secs => $timeout, deadline => $pms->{master_deadline} });
> -  my $err = $timer->run_and_catch(sub {
> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> -
> -    dbg("pyzor: opening pipe: ".
> -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
> -
> -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> -	$pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
> -    $pid or die "$!\n";
> -
> -    # read+split avoids a Perl I/O bug (Bug 5985)
> -    my($inbuf, $nread);
> -    my $resp = '';
> -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
> -    defined $nread  or die "error reading from pipe: $!";
> -    @resp = split(/^/m, $resp, -1);
> -
> -    my $errno = 0;
> -    close PYZOR or $errno = $!;
> -    if (proc_status_ok($?, $errno)) {
> -      dbg("pyzor: [%s] finished successfully", $pid);
> -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
> -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
> -    } else {
> -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> -    }
> -
> -  });
> -
> -  if (defined(fileno(*PYZOR))) {  # still open
> -    if ($pid) {
> -      if (kill('TERM', $pid)) {
> -        dbg("pyzor: killed stale helper [$pid]");
> -      } else {
> -        dbg("pyzor: killing helper application [$pid] failed: $!");
> -      }
> -    }
> -    my $errno = 0;
> -    close PYZOR or $errno = $!;
> -    proc_status_ok($?, $errno)
> -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
> -  }
> -
> -  $pms->leave_helper_run_mode();
> -
> -  if ($timer->timed_out()) {
> -    dbg("pyzor: check timed out after $timeout seconds");
> -    return ();
> -  } elsif ($err) {
> -    chomp $err;
> -    info("pyzor: check failed: $err");
> -    return ();
> -  }
> -
> -  return @resp;
> -}
> -
> -sub check_tick {
> -  my ($self, $opts) = @_;
> -  $self->_check_forked_result($opts->{permsgstatus}, 0);
> -}
> -
> -sub check_cleanup {
> -  my ($self, $opts) = @_;
> -  $self->_check_forked_result($opts->{permsgstatus}, 1);
> -}
> -
> -sub _check_forked_result {
> -  my ($self, $pms, $finish) = @_;
> -
> -  return 0 if !$pms->{pyzor_backchannel};
> -  return 0 if !$pms->{pyzor_pid};
> +    my ( $self, $permsgstatus, $fulltext ) = @_;
> +    my $conf = $self->{main}->{conf};
> +    my $timeout = $conf->{pyzor_timeout};
> +
> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
> +
> +    local $@;
> +    my $ref = eval { $client->check($digest); };
> +    dbg("pyzor: got response: $client->{'_server_host'}");
> +    # $client reply must be an hash
> +    return 0 if (not (ref $ref eq ref {}));
> +    if ($@) {
> +        my $err = $@;
>  
> -  my $timer = $self->{main}->time_method("check_pyzor");
> +        $err = eval { $err->get_message() } || $err;
>  
> -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
> -
> -  my $kid_pid = $pms->{pyzor_pid};
> -  # if $finish, force waiting for the child
> -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
> -  if ($pid == 0) {
> -    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
> -    if ($pms->{pyzor_abort}) {
> -      dbg("pyzor: bailing out due to deadline/shortcircuit");
> -      kill('TERM', $kid_pid);
> -      if (waitpid($kid_pid, WNOHANG) == 0) {
> -        sleep(1);
> -        if (waitpid($kid_pid, WNOHANG) == 0) {
> -          dbg("pyzor: child process $kid_pid still alive, KILL");
> -          kill('KILL', $kid_pid);
> -          waitpid($kid_pid, 0);
> +        warn("pyzor: check failed: $err\n");
> +        return 0;
> +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
> +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
> +          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> +        } else {
> +          dbg("pyzor: check failed with undefined code");
>          }
> -      }
> -      delete $pms->{pyzor_pid};
> -      delete $pms->{pyzor_backchannel};
> +        return 0;
>      }
> -    return 0;
> -  } elsif ($pid == -1) {
> -    # child does not exist?
> -    dbg("pyzor: child process $kid_pid already handled?");
> -    delete $pms->{pyzor_backchannel};
> -    return 0;
> -  }
>  
> -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
> +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
> +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
> +    my $count_min = $conf->{pyzor_count_min};
> +    my $wl_min = $conf->{pyzor_whitelist_min};
>  
> -  dbg("pyzor: child process $kid_pid finished, reading results");
> +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
> +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
>  
> -  my $backmsg;
> -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF);
> -  if (!defined $ret || $ret == 0) {
> -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
> -    delete $pms->{pyzor_backchannel};
> -    return 0;
> -  }
> -
> -  delete $pms->{pyzor_backchannel};
> +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times.");
>  
> -  my $results;
> -  eval {
> -    $results = Storable::thaw($backmsg);
> -  };
> -  if ($@) {
> -    dbg("pyzor: child return value thaw failed: $@");
> -    return;
> -  }
> -
> -  $self->_check_result($pms, $results);
> -}
> +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
> +      $wl_limit);
>  
> -sub _check_result {
> -  my ($self, $pms, $results) = @_;
> -
> -  if (!@$results) {
> -    dbg("pyzor: no response from server");
> -    return 0;
> -  }
> -
> -  my $count = 0;
> -  my $count_wl = 0;
> -  foreach my $res (@$results) {
> -    chomp($res);
> -    if ($res =~ /^Traceback/) {
> -      info("pyzor: internal error, python traceback seen in response: $res");
> +    # Empty body etc results in same hash, we should skip very large numbers..
> +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
> +      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
>        return 0;
>      }
> -    dbg("pyzor: got response: $res");
> -    # this regexp is intended to be a little bit forgiving
> -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
> -      # until pyzor servers can sync their DBs,
> -      # sum counts obtained from all servers
> -      $count += untaint_var($1)+0; # crazy but needs untainting
> -      $count_wl += untaint_var($2)+0;
> -    } else {
> -      # warn on failures to parse
> -      info("pyzor: failure to parse response \"$res\"");
> -    }
> -  }
> -
> -  my $conf = $self->{main}->{conf};
> -
> -  my $count_min = $conf->{pyzor_count_min};
> -  my $wl_min = $conf->{pyzor_whitelist_min};
>  
> -  my $wl_limit = $count_wl >= $wl_min ?
> -    $count * $conf->{pyzor_whitelist_factor} : 0;
> -
> -  dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f",
> -    $wl_limit);
> -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times.");
> -
> -  # Empty body etc results in same hash, we should skip very large numbers..
> -  if ($count >= 1000000 || $count_wl >= 10000) {
> -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000");
> -    return 0;
> -  }
> -
> -  # Whitelisted?
> -  if ($wl_limit && $count_wl >= $wl_limit) {
> -    dbg("pyzor: message whitelisted");
> -    return 0;
> -  }
> +    # Whitelisted?
> +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
> +      dbg("pyzor: message whitelisted");
> +      return 0;
> +    }
>  
> -  if ($count >= $count_min) {
> -    if ($conf->{pyzor_fork}) {
> -      # forked needs to run got_hit()
> -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
> +    if ( $pyzor_count >= $count_min ) {
> +      return 1;
>      }
> -    return 1;
> -  }
>  
> -  return 0;
> +    return 0;
>  }
>  
>  sub plugin_report {
>    my ($self, $options) = @_;
>  
> -  return if !$self->{pyzor_available};
> -  return if !$self->{main}->{conf}->{use_pyzor};
> -  return if $options->{report}->{options}->{dont_report_to_pyzor};
> -  return if !$self->is_pyzor_available();
> -
> -  # use temporary file: open2() is unreliable due to buffering under spamd
> -  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
> -  if ($self->pyzor_report($options, $tmpf)) {
> -    $options->{report}->{report_available} = 1;
> -    info("reporter: spam reported to Pyzor");
> -    $options->{report}->{report_return} = 1;
> -  }
> -  else {
> -    info("reporter: could not report spam to Pyzor");
> -  }
> -  $options->{report}->delete_fulltext_tmpfile($tmpf);
> +  return unless $self->{pyzor_available};
> +  return unless $self->{main}->{conf}->{use_pyzor};
>  
> -  return 1;
> +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available())
> +  {
> +    if ($self->pyzor_report($options)) {
> +      $options->{report}->{report_available} = 1;
> +      info("reporter: spam reported to Pyzor");
> +      $options->{report}->{report_return} = 1;
> +    }
> +    else {
> +      info("reporter: could not report spam to Pyzor");
> +    }
> +  }
>  }
>  
>  sub pyzor_report {
> -  my ($self, $options, $tmpf) = @_;
> -
> -  # note: not really tainted, this came from system configuration file
> -  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
> -  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
> +    my ( $self, $options ) = @_;
>  
> -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
>  
> -  $options->{report}->enter_helper_run_mode();
> +    my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
>  
> -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
> -  my $err = $timer->run_and_catch(sub {
> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
>  
> -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> -
> -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf"));
> -
> -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> -	$tmpf, 1, $path, split(' ', $opts), "report");
> -    $pid or die "$!\n";
> -
> -    my($inbuf,$nread,$nread_all); $nread_all = 0;
> -    # response is ignored, just check its existence
> -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
> -    defined $nread  or die "error reading from pipe: $!";
> -
> -    dbg("pyzor: empty response")  if $nread_all < 1;
> -
> -    my $errno = 0;  close PYZOR or $errno = $!;
> -    # closing a pipe also waits for the process executing on the pipe to
> -    # complete, no need to explicitly call waitpid
> -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
> -    if (proc_status_ok($?,$errno, 0)) {
> -      dbg("pyzor: [%s] reporter finished successfully", $pid);
> -    } else {
> -      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
> +    local $@;
> +    my $ref = eval { $client->report($digest); };
> +    if ($@) {
> +        warn("pyzor: report failed: $@");
> +        return 0;
>      }
> -
> -  });
> -
> -  $options->{report}->leave_helper_run_mode();
> -
> -  if ($timer->timed_out()) {
> -    dbg("reporter: pyzor report timed out after $timeout seconds");
> -    return 0;
> -  }
> -
> -  if ($err) {
> -    chomp $err;
> -    if ($err eq '__brokenpipe__ignore__') {
> -      dbg("reporter: pyzor report failed: broken pipe");
> -    } else {
> -      warn("reporter: pyzor report failed: $err\n");
> +    elsif ( $ref->{'Code'} ne 200 ) {
> +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}");
> +        return 0;
>      }
> -    return 0;
> -  }
>  
> -  return 1;
> +    return 1;
>  }
>  
> -# Version features
> -sub has_fork { 1 }
> -
>  1;
> -
> -=back
> -
> -=cut
> diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
> new file mode 100644
> index 0000000..8ac27f4
> --- /dev/null
> +++ b/lib/Mail/SpamAssassin/Pyzor.pm
> @@ -0,0 +1,56 @@
> +package Mail::SpamAssassin::Pyzor;
> +
> +# Copyright 2018 cPanel, LLC.
> +# All rights reserved.
> +# http://cpanel.net
> +#
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +#
> +
> +use strict;
> +use warnings;
> +
> +our $VERSION = '0.06_01';
> +
> +=encoding utf-8
> +
> +=head1 NAME
> +
> +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
> +
> +=head1 DESCRIPTION
> +
> +This distribution contains Perl implementations of parts of
> +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
> +It is intended for use with L<Mail::SpamAssassin> but may be useful
> +in other contexts.
> +
> +See the following modules for information on specific tools that
> +the distribution includes:
> +
> +=over
> +
> +=item * L<Mail::SpamAssassin::Pyzor::Client>
> +
> +=item * L<Mail::SpamAssassin::Pyzor::Digest>
> +
> +=back
> +
> +=cut
> +
> +1;
> diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> new file mode 100644
> index 0000000..ccff868
> --- /dev/null
> +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> @@ -0,0 +1,415 @@
> +package Mail::SpamAssassin::Pyzor::Client;
> +
> +# Copyright 2018 cPanel, LLC.
> +# All rights reserved.
> +# http://cpanel.net
> +#
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +#
> +
> +use strict;
> +use warnings;
> +
> +=encoding utf-8
> +
> +=head1 NAME
> +
> +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
> +
> +=head1 SYNOPSIS
> +
> +    use Mail::SpamAssassin::Pyzor::Client ();
> +    use Mail::SpamAssassin::Pyzor::Digest ();
> +
> +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
> +
> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
> +
> +    my $check_ref = $client->check($digest);
> +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
> +
> +    my $report_ref = $client->report($digest);
> +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
> +
> +=head1 DESCRIPTION
> +
> +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
> +implements the functionality needed for L<Mail::SpamAssassin>.
> +
> +=head1 PROTOCOL DETAILS
> +
> +The Pyzor protocol is not a published standard, and there appears to be
> +no meaningful public documentation. What follows is enough information,
> +largely gleaned through forum posts and reverse engineering, to facilitate
> +effective use of this module:
> +
> +Pyzor is an RPC-oriented, message-based protocol. Each message
> +is a simple dictionary of 7-bit ASCII keys and values. Server responses
> +always include at least the following:
> +
> +=over
> +
> +=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
> +is an error.
> +
> +=item * C<Diag> - Similar to HTTP status reasons: a text description
> +of the status.
> +
> +=back
> +
> +(NB: There are additional standard response headers that are useful only for
> +the protocol itself and thus are not part of this module???s returns.)
> +
> +=head2 Reliability
> +
> +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
> +destination. A transmission failure can happen in either the request or
> +the response; in either case, a timeout error will result. Such errors
> +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>.
> +
> +=cut
> +
> +#----------------------------------------------------------------------
> +
> +our $VERSION = '0.04';
> +
> +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
> +our $DEFAULT_SERVER_PORT    = 24441;
> +our $DEFAULT_USERNAME       = 'anonymous';
> +our $DEFAULT_PASSWORD       = '';
> +our $DEFAULT_OP_SPEC        = '20,3,60,3';
> +our $PYZOR_PROTOCOL_VERSION = 2.1;
> +our $DEFAULT_TIMEOUT        = 3.5;
> +our $READ_SIZE              = 8192;
> +
> +use IO::Socket::INET ();
> +use Digest::SHA qw(sha1 sha1_hex);
> +
> +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' );
> +
> +#----------------------------------------------------------------------
> +
> +=head1 CONSTRUCTOR
> +
> +=head2 new(%OPTS)
> +
> +Create a new pyzor client.
> +
> +=over 2
> +
> +=item Input
> +
> +%OPTS are (all optional):
> +
> +=over 3
> +
> +=item * C<server_host> - The pyzor server host to connect to (default is
> +C<public.pyzor.org>)
> +
> +=item * C<server_port> - The pyzor server port to connect to (default is
> +24441)
> +
> +=item * C<username> - The username to present to the pyzor server (default
> +is C<anonymous>)
> +
> +=item * C<password> - The password to present to the pyzor server (default
> +is empty)
> +
> +=item * C<timeout> - The maximum time, in seconds, to wait for a response
> +from the pyzor server (defeault is 3.5)
> +
> +=back
> +
> +=item Output
> +
> +=over 3
> +
> +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
> +
> +=back
> +
> +=back
> +
> +=cut
> +
> +sub new {
> +    my ( $class, %OPTS ) = @_;
> +
> +    return bless {
> +        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
> +        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
> +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
> +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
> +        '_op_spec'     => $DEFAULT_OP_SPEC,
> +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
> +    }, $class;
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head1 REQUEST METHODS
> +
> +=head2 report($digest)
> +
> +Report the digest of a spam message to the pyzor server. This function
> +will throw if a messaging failure or timeout happens.
> +
> +=over 2
> +
> +=item Input
> +
> +=over 3
> +
> +=item $digest C<SCALAR>
> +
> +The message digest to report, as given by
> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> +
> +=back
> +
> +=item Output
> +
> +=over 3
> +
> +=item C<HASHREF>
> +
> +Returns a hashref of the standard attributes noted above.
> +
> +=back
> +
> +=back
> +
> +=cut
> +
> +sub report {
> +    my ( $self, $digest ) = @_;
> +
> +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
> +
> +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
> +
> +    return $self->_send_receive_msg($msg_ref);
> +}
> +
> +=head2 check($digest)
> +
> +Check the digest of a message to see if
> +the pyzor server has a report for it. This function
> +will throw if a messaging failure or timeout happens.
> +
> +=over 2
> +
> +=item Input
> +
> +=over 3
> +
> +=item $digest C<SCALAR>
> +
> +The message digest to check, as given by
> +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> +
> +=back
> +
> +=item Output
> +
> +=over 3
> +
> +=item C<HASHREF>
> +
> +Returns a hashref of the standard attributes noted above
> +as well as the following:
> +
> +=over
> +
> +=item * C<Count> - The number of reports the server has received
> +for the given digest.
> +
> +=item * C<WL-Count> - The number of whitelist requests the server has received
> +for the given digest.
> +
> +=back
> +
> +=back
> +
> +=back
> +
> +=cut
> +
> +sub check {
> +    my ( $self, $digest ) = @_;
> +
> +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) );
> +}
> +
> +# ----------------------------------------
> +
> +sub _send_receive_msg {
> +    my ( $self, $msg_ref ) = @_;
> +
> +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
> +
> +    $self->_sign_msg($msg_ref);
> +
> +    return $self->_do_send_receive(
> +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
> +        $thread_id,
> +    );
> +}
> +
> +sub _get_base_msg {
> +    my ( $self, $op, $digest ) = @_;
> +
> +    die "Implementor error: op is required" if !$op;
> +    die "error: digest is required"         if !$digest;
> +
> +    return {
> +        'User'      => $self->{'_username'},
> +        'PV'        => $PYZOR_PROTOCOL_VERSION,
> +        'Time'      => time(),
> +        'Op'        => $op,
> +        'Op-Digest' => $digest,
> +        'Thread'    => $self->_generate_thread_id()
> +    };
> +}
> +
> +sub _do_send_receive {
> +    my ( $self, $packet, $thread_id ) = @_;
> +
> +    my $sock = $self->_get_connection_or_die();
> +
> +    $self->_send_packet( $sock, $packet );
> +    my $response = $self->_receive_packet( $sock, $thread_id );
> +
> +    return 0 if not defined $response;
> +
> +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) };
> +
> +    delete $resp_hr->{'Thread'};
> +
> +    my $response_pv = delete $resp_hr->{'PV'};
> +
> +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
> +        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
> +    }
> +
> +    return $resp_hr;
> +}
> +
> +sub _receive_packet {
> +    my ( $self, $sock, $thread_id ) = @_;
> +
> +    my $timeout = $self->{'_timeout'} * 1000;
> +
> +    my $end_time = time + $self->{'_timeout'};
> +
> +    $sock->blocking(0);
> +    my $response = '';
> +    my $rout     = '';
> +    my $rin      = '';
> +    vec( $rin, fileno($sock), 1 ) = 1;
> +
> +    while (1) {
> +        my $time_left = $end_time - time;
> +
> +        if ( $time_left <= 0 ) {
> +          warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
> +          return;
> +        }
> +
> +        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
> +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
> +            warn "read from socket: $!";
> +        }
> +
> +        if ( index( $response, "\n\n" ) > -1 ) {
> +
> +            # Reject the response unless its thread ID matches what we sent.
> +            # This prevents confusion among concurrent Pyzor reqeusts.
> +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
> +                last;
> +            }
> +            else {
> +                $response = '';
> +            }
> +        }
> +
> +        my $found = select( $rout = $rin, undef, undef, $time_left );
> +        warn "select(): $!" if $found == -1;
> +    }
> +
> +    return $response;
> +}
> +
> +sub _send_packet {
> +    my ( $self, $sock, $packet ) = @_;
> +
> +    $sock->blocking(1);
> +    syswrite( $sock, $packet ) or warn "write to socket: $!";
> +
> +    return;
> +}
> +
> +sub _get_connection_or_die {
> +    my ($self) = @_;
> +
> +    # clear the socket if the PID changes
> +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
> +        undef $self->{'_sock_pid'};
> +        undef $self->{'_sock'};
> +    }
> +
> +    $self->{'_sock_pid'} ||= $$;
> +    $self->{'_sock'}     ||= IO::Socket::INET->new(
> +        'PeerHost' => $self->{'_server_host'},
> +        'PeerPort' => $self->{'_server_port'},
> +        'Proto'    => 'udp'
> +    ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
> +
> +    return $self->{'_sock'};
> +}
> +
> +sub _sign_msg {
> +    my ( $self, $msg_ref ) = @_;
> +
> +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
> +        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
> +    );
> +
> +    return 1;
> +}
> +
> +sub _generate_packet_from_message {
> +    my ( $self, $msg_ref ) = @_;
> +
> +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order );
> +}
> +
> +sub _generate_thread_id {
> +    my $RAND_MAX = 2**16;
> +    my $val      = 0;
> +    $val = int rand($RAND_MAX) while $val < 1024;
> +    return $val;
> +}
> +
> +sub _get_user_pass_hash_key {
> +    my ($self) = @_;
> +
> +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} );
> +}
> +
> +1;
> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> new file mode 100644
> index 0000000..0e8a5ae
> --- /dev/null
> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> @@ -0,0 +1,103 @@
> +package Mail::SpamAssassin::Pyzor::Digest;
> +
> +# Copyright 2018 cPanel, LLC.
> +# All rights reserved.
> +# http://cpanel.net
> +#
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +#
> +
> +use strict;
> +use warnings;
> +
> +=encoding utf-8
> +
> +=head1 NAME
> +
> +Mail::SpamAssassin::Pyzor::Digest
> +
> +=head1 SYNOPSIS
> +
> +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
> +
> +=head1 DESCRIPTION
> +
> +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
> +
> +=cut
> +
> +#----------------------------------------------------------------------
> +
> +use Email::MIME ();
> +
> +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
> +use Digest::SHA qw(sha1_hex);
> +
> +our $VERSION = '0.03';
> +
> +#----------------------------------------------------------------------
> +
> +=head1 FUNCTIONS
> +
> +=head2 $hex = get( $MSG )
> +
> +This takes an email message in raw MIME text format (i.e., as saved in the
> +standard mbox format) and returns the message???s Pyzor digest in lower-case
> +hexadecimal.
> +
> +The output from this function should normally be identical to that of
> +the C<pyzor> script???s C<digest> command. It is suitable for use in
> +L<Mail::SpamAssassin::Pyzor::Client>???s request methods.
> +
> +=cut
> +
> +sub get {
> +    my ($text) = @_;
> +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
> +}
> +
> +# NB: This is called from the test.
> +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
> +    my ($msg_text_sr) = @_;
> +
> +    my $parsed = Email::MIME->new($$msg_text_sr);
> +
> +    my @lines;
> +
> +    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
> +
> +    for my $payload (@$payloads_ar) {
> +        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
> +        for my $line (@p_lines) {
> +            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
> +
> +            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
> +
> +            # Make sure we have an octet string.
> +            utf8::encode($line) if utf8::is_utf8($line);
> +
> +            push @lines, $line;
> +        }
> +    }
> +
> +    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
> +
> +    return $digest_sr;
> +}
> +
> +1;
> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> new file mode 100644
> index 0000000..522accd
> --- /dev/null
> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> @@ -0,0 +1,301 @@
> +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
> +
> +# Copyright 2018 cPanel, LLC.
> +# All rights reserved.
> +# http://cpanel.net
> +#
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +#
> +
> +use strict;
> +use warnings;
> +
> +=encoding utf-8
> +
> +=head1 NAME
> +
> +Mail::SpamAssassin::Pyzor::Digest::Pieces
> +
> +=head1 DESCRIPTION
> +
> +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
> +
> +It reimplements logic found in pyzor???s F<digest.py> module
> +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
> +
> +=cut
> +
> +#----------------------------------------------------------------------
> +
> +use Email::MIME::ContentType ();
> +use Encode                   ();
> +
> +our $VERSION = '0.03';
> +
> +# each tuple is [ offset, length ]
> +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
> +
> +use constant {
> +    _MIN_LINE_LENGTH => 8,
> +
> +    _ATOMIC_NUM_LINES => 4,
> +};
> +
> +#----------------------------------------------------------------------
> +
> +=head1 FUNCTIONS
> +
> +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
> +
> +This imitates the corresponding object method in F<digest.py>.
> +It returns a reference to an array of strings. Each string can be either
> +a byte string or a character string (e.g., UTF-8 decoded).
> +
> +NB: RFC 2822 stipulates that message bodies should use CRLF
> +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
> +will thus convert any plain CRs in a quoted-printable message
> +body into CRLF. Python, though, doesn???t do this, so the output of
> +our implementation of C<digest_payloads()> diverges from that of the Python
> +original. It doesn???t ultimately make a difference since the line-ending
> +whitespace gets trimmed regardless, but it???s necessary to factor in when
> +comparing the output of our implementation with the Python output.
> +
> +=cut
> +
> +sub digest_payloads {
> +    my ($parsed) = @_;
> +
> +    my @subparts = $parsed->subparts();
> +
> +    my @payloads;
> +
> +    if (@subparts) {
> +        @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
> +    }
> +    else {
> +        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
> +
> +        my $payload;
> +
> +        if ( $main_type eq 'text' ) {
> +
> +            # Decode transfer encoding, but leave us as a byte string.
> +            # Note that this is where Email::MIME converts plain LF to CRLF.
> +            $payload = $parsed->body();
> +
> +            # This does the actual character decoding (i.e., ???charset???).
> +            $payload = Encode::decode( $encoding, $payload, $encode_check );
> +
> +            if ( $subtype eq 'html' ) {
> +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> +                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
> +            }
> +        }
> +        else {
> +
> +            # This does no decoding, even of, e.g., quoted-printable or base64.
> +            $payload = $parsed->body_raw();
> +        }
> +
> +        push @payloads, $payload;
> +    }
> +
> +    return \@payloads;
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head2 normalize( $STRING )
> +
> +This imitates the corresponding object method in F<digest.py>.
> +It modifies C<$STRING> in-place.
> +
> +As with the original implementation, if C<$STRING> contains (decoded)
> +Unicode characters, those characters will be parsed accordingly. So:
> +
> +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
> +
> +    normalize($str);
> +
> +The above will leave C<$str> alone, but this:
> +
> +    utf8::decode($str);
> +
> +    normalize($str);
> +
> +??? will trim off the last two bytes from C<$str>.
> +
> +=cut
> +
> +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
> +
> +    # NULs are bad, mm-kay?
> +    $_[0] =~ tr<\0><>d;
> +
> +    # NB: Python???s \s without re.UNICODE is the same as Perl???s \s
> +    # with the /a modifier.
> +    #
> +    # https://docs.python.org/2/library/re.html
> +    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
> +
> +    # Python: re.compile(r'\S{10,}')
> +    $_[0] =~ s<\S{10,}><>ag;
> +
> +    # Python: re.compile(r'\S+@\S+')
> +    $_[0] =~ s<\S+ @ \S+><>agx;
> +
> +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
> +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
> +
> +    # (from digest.py ???)
> +    # Make sure we do the whitespace last because some of the previous
> +    # patterns rely on whitespace.
> +    $_[0] =~ tr< \x09-\x0d><>d;
> +
> +    # This is fun. digest.py???s normalize() does a non-UNICODE whitespace
> +    # strip, then calls strip() on the string, which *will* strip Unicode
> +    # whitespace from the ends.
> +    $_[0] =~ s<\A\s+><>;
> +    $_[0] =~ s<\s+\z><>;
> +
> +    return;
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head2 $yn = should_handle_line( $STRING )
> +
> +This imitates the corresponding object method in F<digest.py>.
> +It returns a boolean.
> +
> +=cut
> +
> +sub should_handle_line {
> +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head2 $sr = assemble_lines( \@LINES )
> +
> +This assembles a string buffer out of @LINES. The string is the buffer
> +of octets that will be hashed to produce the message digest.
> +
> +Each member of @LINES is expected to be an B<octet string>, not a
> +character string.
> +
> +=cut
> +
> +sub assemble_lines {
> +    my ($lines_ar) = @_;
> +
> +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
> +
> +        # cf. handle_atomic() in digest.py
> +        return \join( q<>, @$lines_ar );
> +    }
> +
> +    #----------------------------------------------------------------------
> +    # cf. handle_atomic() in digest.py
> +
> +    my $str = q<>;
> +
> +    for my $ofs_len ( _HASH_SPEC() ) {
> +        my ( $offset, $length ) = @$ofs_len;
> +
> +        for my $i ( 0 .. ( $length - 1 ) ) {
> +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
> +
> +            next if !defined $lines_ar->[$idx];
> +
> +            $str .= $lines_ar->[$idx];
> +        }
> +    }
> +
> +    return \$str;
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
> +
> +=cut
> +
> +use constant _QUOTED_PRINTABLE_NAMES => (
> +    "quopri-codec",
> +    "quopri",
> +    "quoted-printable",
> +    "quotedprintable",
> +);
> +
> +# Make Encode::decode() ignore anything that doesn???t fit the
> +# given encoding.
> +use constant _encode_check_ignore => q<>;
> +
> +sub parse_content_type {
> +    my ($content_type) = @_;
> +
> +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
> +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
> +        $content_type,
> +    );
> +
> +    my $main = $ct_parse->{'type'}    || q<>;
> +    my $sub  = $ct_parse->{'subtype'} || q<>;
> +
> +    my $encoding = $ct_parse->{'attributes'}{'charset'};
> +
> +    my $checkval;
> +
> +    if ($encoding) {
> +
> +        # Lower-case everything, convert underscore to dash, and remove NUL.
> +        $encoding =~ tr<A-Z_\0><a-z->d;
> +
> +        # Apparently pyzor accommodates messages that put the transfer
> +        # encoding in the Content-Type.
> +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
> +            $checkval = Encode::FB_CROAK();
> +        }
> +    }
> +    else {
> +        $encoding = 'ascii';
> +    }
> +
> +    # Match Python .decode()???s 'ignore' behavior
> +    $checkval ||= \&_encode_check_ignore;
> +
> +    return ( $main, $sub, $encoding, $checkval );
> +}
> +
> +#----------------------------------------------------------------------
> +
> +=head2 @lines = splitlines( $TEXT )
> +
> +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
> +
> +Returns a plain list in list context. Returns the number of
> +items to be returned in scalar context.
> +
> +=cut
> +
> +sub splitlines {
> +    return split m<\r\n?|\n>, $_[0];
> +}
> +
> +1;
> diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> new file mode 100644
> index 0000000..2617b4a
> --- /dev/null
> +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> @@ -0,0 +1,177 @@
> +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> +
> +# Copyright 2018 cPanel, LLC.
> +# All rights reserved.
> +# http://cpanel.net
> +#
> +# <@LICENSE>
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to you under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +# </...@LICENSE>
> +#
> +
> +use strict;
> +use warnings;
> +
> +=encoding utf-8
> +
> +=head1 NAME
> +
> +Mail::SpamAssassin::Pyzor::Digest::StripHtml
> +
> +=head1 SYNOPSIS
> +
> +    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
> +
> +=head1 DESCRIPTION
> +
> +This module attempts to duplicate pyzor???s HTML-stripping logic.
> +
> +=head1 ACCURACY
> +
> +This library cannot achieve 100%, bug-for-bug parity with pyzor
> +because to do so would require duplicating Python???s own HTML parsing
> +library. Since that library???s output has changed over time, and those
> +changes in turn affect pyzor, it???s literally impossible to arrive at
> +a single, fully-compatible reimplementation.
> +
> +That said, all known divergences between pyzor and this library involve
> +invalid HTML as input.
> +
> +Please open bug reports for any divergences you identify, particularly
> +if the input is valid HTML.
> +
> +=cut
> +
> +#----------------------------------------------------------------------
> +
> +use HTML::Parser ();
> +
> +our $VERSION = '0.03';
> +
> +#----------------------------------------------------------------------
> +
> +=head1 FUNCTIONS
> +
> +=head2 $stripped = strip( $HTML )
> +
> +Give it some HTML, and it???ll give back the stripped text.
> +
> +In B<general>, the stripping consists of removing tags as well as
> +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
> +removes HTML entities.
> +
> +This tries very hard to duplicate pyzor???s behavior with invalid HTML.
> +
> +=cut
> +
> +sub strip {
> +    my ($html) = @_;
> +
> +    $html =~ s<\A\s+><>;
> +    $html =~ s<\s+\z><>;
> +
> +    my $p = HTML::Parser->new( api_version => 3 );
> +
> +    my @pieces;
> +
> +    my $accumulate = 1;
> +
> +    $p->handler(
> +        start => sub {
> +            my ($tagname) = @_;
> +
> +            $accumulate = 0 if $tagname eq 'script';
> +            $accumulate = 0 if $tagname eq 'style';
> +
> +            return;
> +        },
> +        'tagname',
> +    );
> +
> +    $p->handler(
> +        end => sub {
> +            $accumulate = 1;
> +            return;
> +        }
> +    );
> +
> +    $p->handler(
> +        text => sub {
> +            my ($copy) = @_;
> +
> +            return if !$accumulate;
> +
> +            # pyzor???s HTML parser discards HTML entities. On top of that,
> +            # we need to match, as closely as possible, pyzor???s handling of
> +            # invalid HTML entities ??? which is a function of Python???s
> +            # standard HTML parsing library. This will probably never be
> +            # fully compatible with the pyzor, but we can get it close.
> +
> +            # The original is:
> +            #
> +            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
> +            #
> +            # The parsing loop then ???backs up??? one byte if the last
> +            # character isn???t a ???;???. We use a look-ahead assertion to
> +            # mimic that behavior.
> +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
> +
> +            # The original is:
> +            #
> +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
> +            #
> +            # We again use a look-ahead assertion to mimic Python.
> +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
> +
> +            # Python???s HTMLParser aborts its parsing loop when it encounters
> +            # an invalid numeric reference.
> +            $copy =~ s<\&\#
> +                (?:
> +                    [^0-9xX]        # anything but the expected first char
> +                    |
> +                    [0-9]+[a-fA-F]  # hex within decimal
> +                    |
> +                    [xX][^0-9a-fA-F]
> +                )
> +                (.*)
> +            ><
> +                ( -1 == index($1, ';') ) ? q<> : '&#'
> +            >exs;
> +
> +            # Python???s HTMLParser treats invalid entities as incomplete
> +            $copy =~ s<(\&\#?)><$1 >gx;
> +
> +            $copy =~ s<\A\s+><>;
> +            $copy =~ s<\s+\z><>;
> +
> +            push @pieces, \$copy if length $copy;
> +        },
> +        'text,tagname',
> +    );
> +
> +    $p->parse($html);
> +    $p->eof();
> +
> +    my $payload = join( q< >, map { $$_ } @pieces );
> +
> +    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
> +    # plain spaces.
> +    $payload =~ s<[^\S\x{a0}]+>< >g;
> +
> +    return $payload;
> +}
> +
> +1;
> diff --git a/t/pyzor.t b/t/pyzor.t
> index 891f38d..e4ef83f 100755
> --- a/t/pyzor.t
> +++ b/t/pyzor.t
> @@ -3,12 +3,9 @@
>  use lib '.'; use lib 't';
>  use SATest; sa_t_init("pyzor");
>  
> -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
> -
>  use Test::More;
>  plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
> -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
> -plan tests => 8;
> +plan tests => 5;
>  
>  diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
>  
> @@ -30,7 +27,7 @@ tstprefs ("
>  sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>  ok_all_patterns();
>  # Same with fork
> -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
> +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
>  ok_all_patterns();
>  
>  #TESTING FOR HAM
> @@ -44,7 +41,3 @@ ok_all_patterns();
>  
>  sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
>  ok_all_patterns();
> -# same with fork
> -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
> -ok_all_patterns();
> -