You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by km...@apache.org on 2018/12/14 22:22:50 UTC
svn commit: r1848970 - in /spamassassin: branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Author: kmcgrail
Date: Fri Dec 14 22:22:49 2018
New Revision: 1848970

URL: http://svn.apache.org/viewvc?rev=1848970&view=rev
Log:
Optimize extract of body rules during sa-compile - Bug 7665

Modified:
    spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Modified: spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?rev=1848970&r1=1848969&r2=1848970&view=diff
==============================================================================
--- spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/3.4/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Dec 14 22:22:49 2018
@@ -40,6 +40,16 @@ use warnings;
 # use bytes;
 use re 'taint';
 
+# Not a constant hashref for 5.6 compat
+use constant SLOT_BASE => 0;
+use constant SLOT_NAME => 1;
+use constant SLOT_ORIG => 2;
+use constant SLOT_LEN_BASE => 3;
+use constant SLOT_BASE_INITIAL => 4;
+use constant SLOT_HAS_MULTIPLE => 5;
+
+use constant CLOBBER => '';
+
 our @ISA = qw(Mail::SpamAssassin::Plugin);
 
 use constant DEBUG_RE_PARSING => 0;     # noisy!
@@ -299,66 +309,71 @@ NEXT_RULE:
   # this bit is annoyingly O(N^2).  Rewrite the data -- the @good_bases
   # array -- into a more efficient format, using arrays and with a little
   # bit of precomputation, to go (quite a bit) faster
-
   my @rewritten;
   foreach my $set1 (@good_bases) {
     my $base = $set1->{base};
     next if (!$base || !$set1->{name});
     push @rewritten, [
-      $base,                # 0
-      $set1->{name},        # 1
-      $set1->{orig},        # 2
-      length $base,         # 3
-      qr/\Q$base\E/,        # 4
-      0                     # 5, has_multiple flag
+      $base,                # 0 - SLOT_BASE
+      $set1->{name},        # 1 - SLOT_NAME
+      $set1->{orig},        # 2 - SLOT_ORIG
+      length $base,         # 3 - SLOT_LEN_BASE
+      $base,                # 4 - SLOT_BASE_INITIAL
+      0                     # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
     ];
   }
-  @good_bases = @rewritten;
 
-  foreach my $set1 (@good_bases) {
-    $self->{show_progress} and $progress and $progress->update(++$count);
+  @good_bases = sort {
+    $b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] ||
+    $a->[SLOT_BASE] cmp $b->[SLOT_BASE] ||
+    $a->[SLOT_NAME] cmp $b->[SLOT_NAME] ||
+    $a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
+  } @rewritten;
 
-    my $base1 = $set1->[0]; next unless $base1;
-    my $name1 = $set1->[1];
-    my $orig1 = $set1->[2];
-    $conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
-    my $len1 = $set1->[3];
 
-    foreach my $set2 (@good_bases) {
-      next if ($set1 == $set2);
-
-      my $base2 = $set2->[0]; next unless $base2;
-      my $name2 = $set2->[1];
+  my $base_orig =  $conf->{base_orig}->{$ruletype};
+  my $next_base_position = 0;
+  for my $set1 (@good_bases) {
+    $next_base_position++;
+    $self->{show_progress} and $progress and $progress->update(++$count);
+    my $base1 = $set1->[SLOT_BASE] or next;  # got clobbered
+    my $name1 = $set1->[SLOT_NAME];
+    my $orig1 = $set1->[SLOT_ORIG];
+    my $len1 = $set1->[SLOT_LEN_BASE];
+    $base_orig->{$name1} = $orig1;
 
-      # clobber exact dups; this can happen if a regexp outputs the 
+    foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # remaining bases, ordered longest to shortest
+      # clobber false and exact dups; this can happen if a regexp outputs the
       # same base string multiple times
-      if ($base1 eq $base2 &&
-          $name1 eq $name2 &&
-          $orig1 eq $set2->[2])
+      if (!$set2->[SLOT_BASE] ||
+		(
+		  $base1 eq $set2->[SLOT_BASE] &&
+		  $name1 eq $set2->[SLOT_NAME] &&
+		  $orig1 eq $set2->[SLOT_ORIG]
+		)
+	)
       {
-        $set2->[0] = '';       # clobber
+        #dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
+        $set2->[SLOT_BASE] = CLOBBER;       # clobber
         next;
       }
 
-      # skip if it's too short to contain the other base string
-      next if ($len1 < $set2->[3]);
+      # Cannot be a subset if it does not contain the other base string
+      next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
 
       # skip if either already contains the other rule's name
       # optimize: this can only happen if the base has more than
       # one rule already attached, ie [5]
-      next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);
+      next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/);
 
       # don't use $name1 here, since another base in the set2 loop
       # may have added $name2 since we set that
-      next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);
+      next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/);
 
-      # and finally check to see if it *does* contain the other base string
-      next if ($base1 !~ $set2->[4]);
-
-      # base2 is just a subset of base1
-      # dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");
-      $set1->[1] .= " ".$name2;
-      $set1->[5] = 1;
+      # $set2->[SLOT_BASE] is just a subset of base1
+      #dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
+      $set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
+      $set1->[SLOT_HAS_MULTIPLE] = 1;
     }
   }
 
@@ -378,14 +393,16 @@ NEXT_RULE:
   }
   undef @good_bases;
 
+  my $base_string =  $conf->{base_string}->{$ruletype};
   foreach my $base (keys %bases) {
     # uniq the list, since there are probably dup rules listed
     my %u;
     for my $i (split ' ', $bases{$base}) {
       next if exists $u{$i}; undef $u{$i}; 
     }
-    $conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;
+    $base_string->{$base} = join ' ', sort keys %u;
   }
+
   $self->{show_progress} and $progress and $progress->final();
 
   if ($cachefile) {

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?rev=1848970&r1=1848969&r2=1848970&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Dec 14 22:22:49 2018
@@ -40,6 +40,16 @@ use warnings;
 # use bytes;
 use re 'taint';
 
+# Not a constant hashref for 5.6 compat
+use constant SLOT_BASE => 0;
+use constant SLOT_NAME => 1;
+use constant SLOT_ORIG => 2;
+use constant SLOT_LEN_BASE => 3;
+use constant SLOT_BASE_INITIAL => 4;
+use constant SLOT_HAS_MULTIPLE => 5;
+
+use constant CLOBBER => '';
+
 our @ISA = qw(Mail::SpamAssassin::Plugin);
 
 use constant DEBUG_RE_PARSING => 0;     # noisy!
@@ -295,66 +305,71 @@ NEXT_RULE:
   # this bit is annoyingly O(N^2).  Rewrite the data -- the @good_bases
   # array -- into a more efficient format, using arrays and with a little
   # bit of precomputation, to go (quite a bit) faster
-
   my @rewritten;
   foreach my $set1 (@good_bases) {
     my $base = $set1->{base};
     next if (!$base || !$set1->{name});
     push @rewritten, [
-      $base,                # 0
-      $set1->{name},        # 1
-      $set1->{orig},        # 2
-      length $base,         # 3
-      qr/\Q$base\E/,        # 4
-      0                     # 5, has_multiple flag
+      $base,                # 0 - SLOT_BASE
+      $set1->{name},        # 1 - SLOT_NAME
+      $set1->{orig},        # 2 - SLOT_ORIG
+      length $base,         # 3 - SLOT_LEN_BASE
+      $base,                # 4 - SLOT_BASE_INITIAL
+      0                     # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
     ];
   }
-  @good_bases = @rewritten;
 
-  foreach my $set1 (@good_bases) {
-    $self->{show_progress} and $progress and $progress->update(++$count);
+  @good_bases = sort {
+    $b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] ||
+    $a->[SLOT_BASE] cmp $b->[SLOT_BASE] ||
+    $a->[SLOT_NAME] cmp $b->[SLOT_NAME] ||
+    $a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
+  } @rewritten;
 
-    my $base1 = $set1->[0]; next unless $base1;
-    my $name1 = $set1->[1];
-    my $orig1 = $set1->[2];
-    $conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
-    my $len1 = $set1->[3];
 
-    foreach my $set2 (@good_bases) {
-      next if ($set1 == $set2);
-
-      my $base2 = $set2->[0]; next unless $base2;
-      my $name2 = $set2->[1];
+  my $base_orig =  $conf->{base_orig}->{$ruletype};
+  my $next_base_position = 0;
+  for my $set1 (@good_bases) {
+    $next_base_position++;
+    $self->{show_progress} and $progress and $progress->update(++$count);
+    my $base1 = $set1->[SLOT_BASE] or next;  # got clobbered
+    my $name1 = $set1->[SLOT_NAME];
+    my $orig1 = $set1->[SLOT_ORIG];
+    my $len1 = $set1->[SLOT_LEN_BASE];
+    $base_orig->{$name1} = $orig1;
 
-      # clobber exact dups; this can happen if a regexp outputs the 
+    foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # remaining bases, ordered longest to shortest
+      # clobber false and exact dups; this can happen if a regexp outputs the
       # same base string multiple times
-      if ($base1 eq $base2 &&
-          $name1 eq $name2 &&
-          $orig1 eq $set2->[2])
+      if (!$set2->[SLOT_BASE] ||
+		(
+		  $base1 eq $set2->[SLOT_BASE] &&
+		  $name1 eq $set2->[SLOT_NAME] &&
+		  $orig1 eq $set2->[SLOT_ORIG]
+		)
+	)
       {
-        $set2->[0] = '';       # clobber
+        #dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
+        $set2->[SLOT_BASE] = CLOBBER;       # clobber
         next;
       }
 
-      # skip if it's too short to contain the other base string
-      next if ($len1 < $set2->[3]);
+      # Cannot be a subset if it does not contain the other base string
+      next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
 
       # skip if either already contains the other rule's name
       # optimize: this can only happen if the base has more than
       # one rule already attached, ie [5]
-      next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);
+      next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/);
 
       # don't use $name1 here, since another base in the set2 loop
       # may have added $name2 since we set that
-      next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);
+      next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/);
 
-      # and finally check to see if it *does* contain the other base string
-      next if ($base1 !~ $set2->[4]);
-
-      # base2 is just a subset of base1
-      # dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");
-      $set1->[1] .= " ".$name2;
-      $set1->[5] = 1;
+      # $set2->[SLOT_BASE] is just a subset of base1
+      #dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
+      $set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
+      $set1->[SLOT_HAS_MULTIPLE] = 1;
     }
   }
 
@@ -374,14 +389,16 @@ NEXT_RULE:
   }
   undef @good_bases;
 
+  my $base_string =  $conf->{base_string}->{$ruletype};
   foreach my $base (keys %bases) {
     # uniq the list, since there are probably dup rules listed
     my %u;
     for my $i (split ' ', $bases{$base}) {
       next if exists $u{$i}; undef $u{$i}; 
     }
-    $conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;
+    $base_string->{$base} = join ' ', sort keys %u;
   }
+
   $self->{show_progress} and $progress and $progress->final();
 
   if ($cachefile) {