You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spamassassin.apache.org by "Kevin A. McGrail" <km...@apache.org> on 2020/07/26 14:07:44 UTC
Re: svn commit: r1880308 - /spamassassin/trunk/masses/hit-frequencies

Nice!

On Sun, Jul 26, 2020, 01:50 <he...@apache.org> wrote:

> Author: hege
> Date: Sun Jul 26 05:50:00 2020
> New Revision: 1880308
>
> URL: http://svn.apache.org/viewvc?rev=1880308&view=rev
> Log:
> Tweaks to increase speed, cut runtime in half
>
> Modified:
>     spamassassin/trunk/masses/hit-frequencies
>
> Modified: spamassassin/trunk/masses/hit-frequencies
> URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff
>
> ==============================================================================
> --- spamassassin/trunk/masses/hit-frequencies (original)
> +++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020
> @@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {
>    my %overlaps_ham1r = ();
>    my %overlaps_spam1r = ();
>
> -  foreach my $r2 (keys %hmap_spam) {
> -    next if $r1 eq $r2;
> -
> -    # require that both rules have at least 1 hit
> -    next unless ($freq_spam{$r1} && $freq_spam{$r2});
> -
> -    my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> -                                    $hmap_spam{$r2}, $hmap_spam{$r1});
> -
> -    if ($a1ina2 > 0)
> -    {
> -      $overlaps_spam1r{$r2} = $a1ina2;
> -
> -      if (exists $overlaps_spam1{$a1ina2})
> -      { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> -      if (exists $overlaps_spam2{$a2ina1})
> -      { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> +  if ($freq_spam{$r1}) {
> +    foreach my $r2 (keys %hmap_spam) {
> +      next if $r1 eq $r2;
> +
> +      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> +                                      $hmap_spam{$r2}, $hmap_spam{$r1});
> +
> +      if ($a1ina2 > 0)
> +      {
> +        $overlaps_spam1r{$r2} = $a1ina2;
> +
> +        if (exists $overlaps_spam1{$a1ina2})
> +        { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> +        if (exists $overlaps_spam2{$a2ina1})
> +        { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> +      }
>      }
> -
>    }
>
> -  foreach my $r2 (keys %hmap_ham) {
> -    next if $r1 eq $r2;
> -
> -    # require that both rules have at least 1 hit
> -    next unless ($freq_ham{$r1} && $freq_ham{$r2});
> -
> -    my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> -                                    $hmap_ham{$r2}, $hmap_ham{$r1});
> -
> -    if ($a1ina2 > 0)
> -    {
> -      $overlaps_ham1r{$r2} = $a1ina2;
> -
> -      if (exists $overlaps_ham1{$a1ina2})
> -      { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> -      else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> -      if (exists $overlaps_ham2{$a2ina1})
> -      { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> -      else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> +  if ($freq_ham{$r1}) {
> +    foreach my $r2 (keys %hmap_ham) {
> +      next if $r1 eq $r2;
> +
> +      my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> +                                      $hmap_ham{$r2}, $hmap_ham{$r1});
> +
> +      if ($a1ina2 > 0)
> +      {
> +        $overlaps_ham1r{$r2} = $a1ina2;
> +
> +        if (exists $overlaps_ham1{$a1ina2})
> +        { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> +        else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> +        if (exists $overlaps_ham2{$a2ina1})
> +        { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> +        else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> +      }
>      }
> -
>    }
>
>    _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \%overlaps_ham1r, "ham");
> @@ -934,25 +930,23 @@ sub _prettify_overlap_rules {
>  sub _hmap_to_overlap_ratio {
>    my ($r1, $r2, $hmap1, $hmap2) = @_;
>
> -  $hmap1 ||= '';
> -  $hmap2 ||= '';
> -  if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {
> -    # no hits on either! this would normally give a 100% hitrate match,
> -    # but that's misleading -- so hide it by giving it a 0% overlap.
> -    #
> -    # also, ignore cases where there are no hits on *one* of the rules,
> -    # while there are hits on the other -- after all, if one rule doesn't
> -    # have a single hit, it cannot overlap.
> -    #
> -    return (0,0);
> -  }
> -
>    # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
>
>    # count bits in each, so we can show when one is fully subsumed by another
>    # with perl's support for bitstring ops, we get C speed here, nice!
> +
> +  # no hits on either? this would normally give a 100% hitrate match,
> +  # but that's misleading -- so hide it by giving it a 0% overlap.
> +  #
> +  # also, ignore cases where there are no hits on *one* of the rules,
> +  # while there are hits on the other -- after all, if one rule doesn't
> +  # have a single hit, it cannot overlap.
> +
>    my $a1 = unpack("%32b*", $hmap1);
> +  return (0,0) unless $a1;
>    my $a2 = unpack("%32b*", $hmap2);
> +  return (0,0) unless $a2;
> +
>    my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));
>
>    # round rather than truncate
>
>
>