You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spamassassin.apache.org by William Taylor <wi...@sonic.com> on 2019/01/10 23:04:02 UTC
Plugin::Phishing is slow
We just enabled Mail::SpamAssassin::Plugin::Phishing and realized it was
causing our spam processing to slow way down.
It looks like the slowdown is caused by matching URLs with grep against an
array. Here is some sample code that shows the difference in speed.
Output should be similar to this below. Notice the performance difference between the
two? I didn't check memory usage but I can't imagine it being that bad plus
ram is cheap and I'd rather see processing as fast as possible.
I can provide a proposed patch if needed, but it's pretty straightforward to
switch to a hash-based lookup instead of an array.
This shows worst case scenario of not finding a match.
Rate originalCode newCode
originalCode 7877/s -- -100%
newCode 4139773/s 52456% --
This shows best case scenario of first element matching
Rate originalCode newCode
originalCode 7907/s -- -100%
newCode 3665454/s 46260% --
This shows element matching about halfway through the list
Rate originalCode newCode
originalCode 7780/s -- -100%
newCode 3668842/s 47060% --
#!/usr/bin/perl
# Benchmark: array-grep vs. hash-key lookup for phishing-URL matching,
# mimicking the hot path of Mail::SpamAssassin::Plugin::Phishing.
use strict;
use warnings;
use Benchmark qw(:all);
use File::Slurp;
use LWP::Simple;    # also exports is_success() from HTTP::Status
use POSIX qw(floor);

my $feed = 'feed.txt';

# Download the feed file if we don't already have a local copy.
if ( !-f $feed )
{
    # getstore() returns the HTTP *status code*, so a plain truth test
    # would treat an error response such as 404 (a true value) as
    # success and never die. Test the status class instead.
    unless ( is_success( getstore( 'https://openphish.com/feed.txt', $feed ) ) )
    {
        die "Could not download feed\n";
    }
}

# Fake just enough of the plugin's per-message state: $pms holds the
# phishing URL list, $info->{cleaned} holds the URLs found in the mail.
my $pms = {};
$pms->{PHISHING} = {};
$pms->{PHISHING}->{phishurl} = [];
my $info = {};
$info->{cleaned} = [
    qw(
        http://www.example.com/phishurl
    )
];

my @urls = read_file($feed);
chomp @urls;
my $halfCount = floor( scalar(@urls) / 2 );

$pms->{PHISHING}{phishurl} = \@urls;

# Lookup hash used by newCode(): keys are the URLs, values are unused.
my %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };

### Worst case: the probe URL is not in the feed at all.
print "This shows worst case scenario of not finding a match.\n";
cmpthese(
    -10,    # negative count: run each sub for at least 10 CPU seconds
    { originalCode => \&originalCode,
      newCode      => \&newCode
    }
);

### Best case: the probe URL is the first element of the list.
unshift @{ $pms->{PHISHING}{phishurl} }, "http://www.example.com/phishurl";
%hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };

print "\nThis shows best case scenario of first element matching\n";
cmpthese(
    -10,
    { originalCode => \&originalCode,
      newCode      => \&newCode
    }
);

#### Halfway match: reload the feed and splice the probe URL into the middle.
@urls = read_file($feed);
chomp @urls;
$pms->{PHISHING}{phishurl} = \@urls;
splice( @{ $pms->{PHISHING}{phishurl} }, $halfCount, 0, "http://www.example.com/phishurl" );
%hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };

print "\nThis shows element matching about halfway through the list\n";
cmpthese(
    -10,
    { originalCode => \&originalCode,
      newCode      => \&newCode
    }
);

exit;

# Note this gets a lot worse if an email contains multiple URLs to check.
# Also, the original plugin code had a length check. This is really
# unnecessary. Benchmarks weren't done with it, but I'd imagine it adds
# some slowness. If really paranoid about it, do the length check when
# adding to the array or hash.
# Reference implementation under test: linear grep over the whole URL
# array for each cleaned URL. Intentionally kept as a full-array grep —
# this is the "slow" code path the benchmark is demonstrating.
# Returns 1 on the first match, or nothing when no URL matches.
sub originalCode
{
    foreach my $candidate ( @{ $info->{cleaned} } )
    {
        my @matches = grep { $candidate eq $_ } @{ $pms->{PHISHING}->{phishurl} };
        return 1 if @matches;
    }
}
# Proposed implementation: O(1) hash-key existence test per cleaned URL
# against the pre-built %hash lookup table.
# Returns 1 on the first match, or nothing when no URL matches.
sub newCode
{
    foreach my $candidate ( @{ $info->{cleaned} } )
    {
        next unless exists $hash{$candidate};
        return 1;
    }
}
Re: Plugin::Phishing is slow
Posted by Giovanni Bechis <gi...@paclan.it>.
On 1/11/19 12:04 AM, William Taylor wrote:
> We just enabled Mail::SpamAssassin::Plugin::Phishing and realized it was
> causing our spam processing to slow way down.
>
> Looks like the slow down is caused by matching urls with grep against an
> array. Here is some sample code that shows the difference in speed.
>
> Output should be similar to this below. Notice the performance difference between the
> two? I didn't check memory usage but I can't imagine it being that bad plus
> ram is cheap and I'd rather see processing as fast as possible.
>
> I can provide a proposed patch if needed but its pretty straight forward to switch
> to hash based instead of an array.
>
thanks, I committed some improvements.
Cheers
Giovanni Bechis
> This shows worst case scenario of not finding a match.
> Rate originalCode newCode
> originalCode 7877/s -- -100%
> newCode 4139773/s 52456% --
>
> This shows best case scenario of first element matching
> Rate originalCode newCode
> originalCode 7907/s -- -100%
> newCode 3665454/s 46260% --
>
> This shows element matching about halfway through the list
> Rate originalCode newCode
> originalCode 7780/s -- -100%
> newCode 3668842/s 47060% --
>
>
>
> #!/usr/bin/perl
> use strict;
> use warnings;
> use Benchmark qw(:all);
> use File::Slurp;
> use LWP::Simple;
> use POSIX qw(floor);
>
> my $feed = 'feed.txt';
>
> # Download feed file if needed
> if ( !-f $feed )
> {
> unless ( getstore( 'https://openphish.com/feed.txt', $feed ) )
> {
> die "Could not download feed\n";
> }
> }
>
> my $pms = {};
> $pms->{PHISHING} = {};
> $pms->{PHISHING}->{phishurl} = [];
> my $info = {};
> $info->{cleaned} = [
> qw(
> http://www.example.com/phishurl
> )
> ];
>
> my @urls = read_file($feed);
> chomp @urls;
> my $halfCount = floor( scalar(@urls) / 2 );
>
> $pms->{PHISHING}{phishurl} = \@urls;
> my %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
>
> ### Worst Case
> print "This shows worst case scenario of not finding a match.\n";
> cmpthese(
> -10,
> { originalCode => \&originalCode,
> newCode => \&newCode
> }
> );
>
> ### Best Case
> unshift @{ $pms->{PHISHING}{phishurl} }, "http://www.example.com/phishurl";
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
>
> print "\nThis shows best case scenario of first element matching\n";
> cmpthese(
> -10,
> { originalCode => \&originalCode,
> newCode => \&newCode
> }
> );
>
> #### Halfway match
> @urls = read_file($feed);
> chomp @urls;
> $pms->{PHISHING}{phishurl} = \@urls;
> splice( @{ $pms->{PHISHING}{phishurl} }, $halfCount, 0, "http://www.example.com/phishurl" );
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
>
> print "\nThis shows element matching about halfway through the list\n";
> cmpthese(
> -10,
> { originalCode => \&originalCode,
> newCode => \&newCode
> }
> );
>
> exit;
>
> # Note this gets a lot worse if an email contains multiple urls to check.
> # Also original code had length check. This is really unecesarry. Benchmarks
> # weren't done with it but I'd imagine it adds some slowness. If really parinoid
> # about it, do the length check when adding to the array or hash.
> sub originalCode
> {
> foreach my $cluri ( @{ $info->{cleaned} } )
> {
> if ( grep { $cluri eq $_ } @{ $pms->{PHISHING}->{phishurl} } )
> {
> return 1;
> }
> }
> }
>
> sub newCode
> {
> foreach my $cluri ( @{ $info->{cleaned} } )
> {
> return 1 if exists $hash{$cluri};
> }
> }
>