You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/12/07 21:37:32 UTC

svn commit: r483650 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/Conf/Parser.pm lib/Mail/SpamAssassin/PerMsgStatus.pm masses/mass-check masses/plugins/Dumpmem.pm t/duplicates.t t/priorities.t

Author: jm
Date: Thu Dec  7 12:37:31 2006
New Revision: 483650

URL: http://svn.apache.org/viewvc?view=rev&rev=483650
Log:
bug 5206: detect duplicate rules, and silently merge them internally for greater efficiency.  This results in about 100-120KB RAM usage saving in current svn trunk's ruleset, detecting lots of duplicate rules -- so is well worth doing.  also, change t/priorities.t so it doesn't accidentally confuse itself with duplicate rules

Added:
    spamassassin/trunk/masses/plugins/Dumpmem.pm
    spamassassin/trunk/t/duplicates.t   (with props)
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/masses/mass-check
    spamassassin/trunk/t/priorities.t

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Thu Dec  7 12:37:31 2006
@@ -465,3 +465,4 @@
 t/spamd_unix_and_tcp.t
 lib/Mail/SpamAssassin/Plugin/VBounce.pm
 rules/20_vbounce.cf
+t/duplicates.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm Thu Dec  7 12:37:31 2006
@@ -2629,6 +2629,7 @@
   $self->{rawbody_evals} = { };
   $self->{meta_tests} = { };
   $self->{eval_plugins} = { };
+  $self->{duplicate_rules} = { };
 
   # testing stuff
   $self->{regression_tests} = { };

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm Thu Dec  7 12:37:31 2006
@@ -685,6 +685,7 @@
 
   $self->trace_meta_dependencies();
   $self->fix_priorities();
+  $self->find_dup_rules();          # must be after fix_priorities()
 
   dbg("conf: finish parsing");
 
@@ -838,6 +839,51 @@
         $pri->{$dep} = $basepri;
       }
     }
+  }
+}
+
+sub find_dup_rules {
+  my ($self) = @_;
+  my $conf = $self->{conf};
+
+  my %names_for_text = ();
+  my %dups = ();
+  while (my ($name, $text) = each %{$conf->{tests}}) {
+    my $type = $conf->{test_types}->{$name};
+    # ensure similar, but differently-typed, rules are not marked as dups
+    $text = "$type\t$text";      
+
+    if (defined $names_for_text{$text}) {
+      $names_for_text{$text} .= " ".$name;
+      $dups{$text} = 1;     # found (at least) one
+    } else {
+      $names_for_text{$text} = $name;
+    }
+  }
+
+  foreach my $text (keys %dups) {
+    my $first;
+    my $first_pri;
+    my @names = sort {$a cmp $b} split(' ', $names_for_text{$text});
+    foreach my $name (@names) {
+      my $priority = $conf->{priority}->{$name} || 0;
+
+      if (!defined $first || $priority < $first_pri) {
+        $first_pri = $priority;
+        $first = $name;
+      }
+    }
+    # $first is now the earliest-occurring rule. mark others as dups
+
+    my @dups = ();
+    foreach my $name (@names) {
+      next if $name eq $first;
+      push @dups, $name;
+      delete $conf->{tests}->{$name};
+    }
+
+    dbg("rules: $first has duplicates: ".join(' ', @dups));
+    $conf->{duplicate_rules}->{$first} = \@dups;
   }
 }
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Thu Dec  7 12:37:31 2006
@@ -2115,6 +2115,15 @@
             $area,
             $params{ruletype},
             $self->{conf}->get_description_for_rule($rule) || $rule);
+
+  # take care of duplicate rules, too (bug 5206)
+  my $dups = $self->{conf}->{duplicate_rules}->{$rule};
+  if ($dups && @{$dups}) {
+    foreach my $dup (@{$dups}) {
+      $self->got_hit($dup, $area, %params);
+    }
+  }
+
   return 1;
 }
 

Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mass-check?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Thu Dec  7 12:37:31 2006
@@ -736,8 +736,6 @@
     }
   }
 
-# use Mail::SpamAssassin::Util::MemoryDump; Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG(); use Mail::SpamAssassin::Util::MemoryDump; Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG_dump_obj($status); #JMD
-
   if (defined $status) { $status->finish(); }
   $ma->finish();
   undef $ma;		# clean 'em up

Added: spamassassin/trunk/masses/plugins/Dumpmem.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/plugins/Dumpmem.pm?view=auto&rev=483650
==============================================================================
--- spamassassin/trunk/masses/plugins/Dumpmem.pm (added)
+++ spamassassin/trunk/masses/plugins/Dumpmem.pm Thu Dec  7 12:37:31 2006
@@ -0,0 +1,24 @@
+package Dumpmem;
+use strict;
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Plugin;
+our @ISA = qw(Mail::SpamAssassin::Plugin);
+
+use Mail::SpamAssassin::Util::MemoryDump;
+
+sub new {
+  my ($class, $mailsa) = @_;
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsa);
+  bless ($self, $class);
+  return $self;
+}
+
+sub per_msg_finish {
+  my ($self, $opts) = @_;
+  Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG();
+  Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG_dump_obj(
+                                $opts->{permsgstatus}->{main});
+}
+
+1;

Added: spamassassin/trunk/t/duplicates.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/duplicates.t?view=auto&rev=483650
==============================================================================
--- spamassassin/trunk/t/duplicates.t (added)
+++ spamassassin/trunk/t/duplicates.t Thu Dec  7 12:37:31 2006
@@ -0,0 +1,60 @@
+#!/usr/bin/perl
+
+use lib '.'; use lib 't';
+use SATest; sa_t_init("duplicates");
+use Test; BEGIN { plan tests => 16 };
+
+$ENV{'LANGUAGE'} = $ENV{'LC_ALL'} = 'C';             # a cheat, but we need the patterns to work
+
+# ---------------------------------------------------------------------------
+
+%patterns = (
+
+  q{ FOO1 }     => '',  # use default names
+  q{ FOO2 }     => '',
+  q{ HDR1 }     => '',
+  q{ HDR2 }     => '',
+  q{ META1 }     => '',
+  q{ META2 }     => '',
+  q{ META3 }     => '',
+  q{ ran body rule FOO1 ======> got hit } => '',
+  q{ ran header rule HDR1 ======> got hit } => '',
+  q{ rules: FOO1 has duplicates: FOO2 } => '',
+  q{ rules: HDR1 has duplicates: HDR2 } => '',
+
+);
+
+%anti_patterns = (
+
+  q{ FOO3 }     => '',
+  q{ RAWFOO }   => '',
+  q{ ran body rule FOO2 ======> got hit } => '',
+  q{ ran header rule HDR2 ======> got hit } => '',
+
+);
+
+tstprefs (qq{
+
+   $default_cf_lines
+
+   body FOO1 /click here and e= nter your/i
+   body FOO2 /click here and e= nter your/i
+
+   # should not be found, not a dup (/i)
+   body FOO3 /click here and e= nter your/
+
+   # should not be found, not dup since different type
+   rawbody RAWFOO /click here and e= nter your/i
+
+   header HDR1 Subject =~ /stained/
+   header HDR2 Subject =~ /stained/
+
+   meta META1 (1)
+   meta META2 (META1 && META3)
+   meta META3 (1)
+   priority META3 -500
+
+});
+
+sarun ("-L -t -D < data/spam/006 2>&1", \&patterns_run_cb);
+ok ok_all_patterns();

Propchange: spamassassin/trunk/t/duplicates.t
------------------------------------------------------------------------------
    svn:executable = *

Modified: spamassassin/trunk/t/priorities.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/priorities.t?view=diff&rev=483650&r1=483649&r2=483650
==============================================================================
--- spamassassin/trunk/t/priorities.t (original)
+++ spamassassin/trunk/t/priorities.t Thu Dec  7 12:37:31 2006
@@ -63,8 +63,8 @@
   meta FOO1 (FOO2 && FOO3)
   meta FOO2 (1)
   meta FOO3 (FOO4 && FOO5)
-  meta FOO4 (1)
-  meta FOO5 (1)
+  meta FOO4 (2)
+  meta FOO5 (3)
   priority FOO5 -23
   priority FOO1 -28
 



Re: svn commit: r483650 - in /spamassassin/trunk: MANIFEST lib/Mail/SpamAssassin/Conf.pm lib/Mail/SpamAssassin/Conf/Parser.pm lib/Mail/SpamAssassin/PerMsgStatus.pm masses/mass-check masses/plugins/Dumpmem.pm t/duplicates.t t/priorities.t

Posted by "Daryl C. W. O'Shea" <sp...@dostech.ca>.
jm@apache.org wrote:
> Author: jm
> Date: Thu Dec  7 12:37:31 2006
> New Revision: 483650
> 
> URL: http://svn.apache.org/viewvc?view=rev&rev=483650
> Log:
> bug 5206: detect duplicate rules, and silently merge them internally for greater efficiency.  This results in about 100-120KB RAM usage saving in current svn trunk's ruleset, detecting lots of duplicate rules -- so is well worth doing.  also, change t/priorities.t so it doesn't accidentally confuse itself with duplicate rules

> +sub find_dup_rules {

> +    if (defined $names_for_text{$text}) {
> +      $names_for_text{$text} .= " ".$name;
> +      $dups{$text} = 1;     # found (at least) one
> +    } else {
> +      $names_for_text{$text} = $name;
> +    }
> +  }

undef rather than 1 could have saved you another 8 bytes (total) or so 
in addition to the 100KB you already saved.  :)

In any case, good enhancement... I'm sure there are a LOT of rules that 
overlap with third party rules.


Daryl