You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spamassassin.apache.org by William Taylor <wi...@sonic.com> on 2019/01/10 23:04:02 UTC

Plugin::Phishing is slow

We just enabled Mail::SpamAssassin::Plugin::Phishing and realized it was 
causing our spam processing to slow way down.

It looks like the slowdown is caused by matching URLs with grep against an 
array. Here is some sample code that shows the difference in speed.

Output should be similar to the results below. Notice the performance difference between
the two? I didn't check memory usage, but I can't imagine it being that bad; plus,
RAM is cheap and I'd rather see processing be as fast as possible.

I can provide a proposed patch if needed, but it's pretty straightforward to switch
from the array-based lookup to a hash-based one.

This shows worst case scenario of not finding a match.
                  Rate originalCode      newCode
originalCode    7877/s           --        -100%
newCode      4139773/s       52456%           --

This shows best case scenario of first element matching
                  Rate originalCode      newCode
originalCode    7907/s           --        -100%
newCode      3665454/s       46260%           --

This shows element matching about halfway through the list
                  Rate originalCode      newCode
originalCode    7780/s           --        -100%
newCode      3668842/s       47060%           --



#!/usr/bin/perl
use strict;
use warnings;
use Benchmark qw(:all);
use File::Slurp;
use LWP::Simple;
use POSIX qw(floor);

my $feed = 'feed.txt';

# Download the phishing feed once and cache it locally, so repeated
# benchmark runs don't hit the network.
if ( !-f $feed )
{
    # BUG FIX: getstore() returns the HTTP status code, which is a true
    # value even on failure (e.g. 404), so "unless (getstore(...))" never
    # detected a failed download. Test the status with is_success(),
    # which LWP::Simple exports by default.
    my $status = getstore( 'https://openphish.com/feed.txt', $feed );
    unless ( is_success($status) )
    {
        die "Could not download feed\n";
    }
}

# Mimic the per-message-status / plugin data layout used by
# Mail::SpamAssassin::Plugin::Phishing: a list of known phishing URLs
# ($pms) and the cleaned URLs extracted from the message ($info).
my $pms  = { PHISHING => { phishurl => [] } };
my $info = { cleaned  => ['http://www.example.com/phishurl'] };

# Load the feed and remember its midpoint for the "halfway" scenario.
my @urls = read_file($feed);
chomp @urls;
my $halfCount = floor( @urls / 2 );

$pms->{PHISHING}{phishurl} = \@urls;

# Hash-based lookup table: keys are the URLs; the values are irrelevant
# (undef), since membership is tested with exists().
my %hash;
@hash{ @{ $pms->{PHISHING}{phishurl} } } = ();

# All three scenarios run the same comparison; factor out the repeated
# print + cmpthese boilerplate into a closure.
my $compare = sub {
    my ($headline) = @_;
    print $headline;
    cmpthese(
        -10,
        {   originalCode => \&originalCode,
            newCode      => \&newCode
        }
    );
};

### Worst case: the sample URL is absent from the feed entirely.
$compare->("This shows worst case scenario of not finding a match.\n");

### Best case: the sample URL is the very first feed entry.
unshift @{ $pms->{PHISHING}{phishurl} }, "http://www.example.com/phishurl";
%hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
$compare->("\nThis shows best case scenario of first element matching\n");

### Halfway case: reload the pristine feed, then splice the sample URL
### into the middle of the list.
@urls = read_file($feed);
chomp @urls;
$pms->{PHISHING}{phishurl} = \@urls;
splice( @{ $pms->{PHISHING}{phishurl} }, $halfCount, 0, "http://www.example.com/phishurl" );
%hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
$compare->("\nThis shows element matching about halfway through the list\n");

exit;

# Note this gets a lot worse if an email contains multiple urls to check.
# Also, the original code had a length check. This is really unnecessary. Benchmarks
# weren't done with it, but I'd imagine it adds some slowness. If really paranoid
# about it, do the length check when adding to the array or hash.
# Array-based scan as done by the plugin: a linear grep over the whole
# feed list for every cleaned URL in the message. The grep is kept
# deliberately — it is the code path being benchmarked.
sub originalCode
{
    for my $uri ( @{ $info->{cleaned} } )
    {
        return 1 if grep { $_ eq $uri } @{ $pms->{PHISHING}->{phishurl} };
    }
}

# Hash-based lookup: one exists() probe per cleaned URL, independent of
# the feed size.
sub newCode
{
    for my $candidate ( @{ $info->{cleaned} } )
    {
        next unless exists $hash{$candidate};
        return 1;
    }
}


Re: Plugin::Phishing is slow

Posted by Giovanni Bechis <gi...@paclan.it>.
On 1/11/19 12:04 AM, William Taylor wrote:
> We just enabled Mail::SpamAssassin::Plugin::Phishing and realized it was 
> causing our spam processing to slow way down.
> 
> Looks like the slow down is caused by matching urls with grep against an 
> array. Here is some sample code that shows the difference in speed.
> 
> Output should be similar to this below. Notice the performance difference between the
> two? I didn't check memory usage but I can't imagine it being that bad plus
> ram is cheap and I'd rather see processing as fast as possible.
> 
> I can provide a proposed patch if needed but its pretty straight forward to switch
> to hash based instead of an array.
> 
thanks, I committed some improvements.
 Cheers
  Giovanni Bechis


> This shows worst case scenario of not finding a match.
>                   Rate originalCode      newCode
> originalCode    7877/s           --        -100%
> newCode      4139773/s       52456%           --
> 
> This shows best case scenario of first element matching
>                   Rate originalCode      newCode
> originalCode    7907/s           --        -100%
> newCode      3665454/s       46260%           --
> 
> This shows element matching about halfway through the list
>                   Rate originalCode      newCode
> originalCode    7780/s           --        -100%
> newCode      3668842/s       47060%           --
> 
> 
> 
> #!/usr/bin/perl
> use strict;
> use warnings;
> use Benchmark qw(:all);
> use File::Slurp;
> use LWP::Simple;
> use POSIX qw(floor);
> 
> my $feed = 'feed.txt';
> 
> # Download feed file if needed
> if ( !-f $feed )
> {
>     unless ( getstore( 'https://openphish.com/feed.txt', $feed ) )
>     {
>         die "Could not download feed\n";
>     }
> }
> 
> my $pms = {};
> $pms->{PHISHING} = {};
> $pms->{PHISHING}->{phishurl} = [];
> my $info = {};
> $info->{cleaned} = [
>     qw(
>         http://www.example.com/phishurl
>         )
> ];
> 
> my @urls = read_file($feed);
> chomp @urls;
> my $halfCount = floor( scalar(@urls) / 2 );
> 
> $pms->{PHISHING}{phishurl} = \@urls;
> my %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> ### Worst Case
> print "This shows worst case scenario of not finding a match.\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> ### Best Case
> unshift @{ $pms->{PHISHING}{phishurl} }, "http://www.example.com/phishurl";
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> print "\nThis shows best case scenario of first element matching\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> #### Halfway match
> @urls = read_file($feed);
> chomp @urls;
> $pms->{PHISHING}{phishurl} = \@urls;
> splice( @{ $pms->{PHISHING}{phishurl} }, $halfCount, 0, "http://www.example.com/phishurl" );
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> print "\nThis shows element matching about halfway through the list\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> exit;
> 
> # Note this gets a lot worse if an email contains multiple urls to check.
> # Also original code had length check. This is really unecesarry. Benchmarks
> # weren't done with it but I'd imagine it adds some slowness. If really parinoid
> # about it, do the length check when adding to the array or hash.
> sub originalCode
> {
>     foreach my $cluri ( @{ $info->{cleaned} } )
>     {
>         if ( grep { $cluri eq $_ } @{ $pms->{PHISHING}->{phishurl} } )
>         {
>             return 1;
>         }
>     }
> }
> 
> sub newCode
> {
>     foreach my $cluri ( @{ $info->{cleaned} } )
>     {
>         return 1 if exists $hash{$cluri};
>     }
> }
>