You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/05/21 08:51:57 UTC

svn commit: r1901096 - in /spamassassin/trunk: ./ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Conf/ lib/Mail/SpamAssassin/Plugin/ t/

Author: hege
Date: Sat May 21 08:51:57 2022
New Revision: 1901096

URL: http://svn.apache.org/viewvc?rev=1901096&view=rev
Log:
- Named capture cleanups: add tests and new PerMsgStatus::set_captures and Conf::Parser::parse_captures functions (Bug 7992)
- MIMEHeader: support named regex captures, add tflags multiple support, improve tests

Added:
    spamassassin/trunk/t/regexp_named_capture.t   (with props)
Modified:
    spamassassin/trunk/MANIFEST
    spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Check.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm
    spamassassin/trunk/t/mimeheader.t
    spamassassin/trunk/t/testrules.yml

Modified: spamassassin/trunk/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/trunk/MANIFEST?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/MANIFEST (original)
+++ spamassassin/trunk/MANIFEST Sat May 21 08:51:57 2022
@@ -553,6 +553,7 @@ t/re_base_extraction.t
 t/recips.t
 t/recreate.t
 t/recursion.t
+t/regexp_named_capture.t
 t/regexp_valid.t
 t/relative_scores.t
 t/relaycountry.t

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Conf/Parser.pm Sat May 21 08:51:57 2022
@@ -1338,18 +1338,7 @@ sub add_test {
       return;
     }
     $conf->{test_qrs}->{$name} = $rec;
-    # Check for named regex capture templates
-    if (index($rec, '"""') >= 0) {
-      local($1);
-      while ($rec =~ /"""(\w+)"""/g) {
-        $conf->{capture_rules}->{$name}->{$1} = 1;
-      }
-    }
-    # Make rules with captures run before anything else
-    if ($rec =~ /\(\?[<']\w/) {
-      dbg("config: adjusting regex capture rule $name priority to -10000");
-      $conf->{priority}->{$name} = -10000;
-    }
+    $self->parse_captures($name, $rec);
   }
   elsif ($type == $Mail::SpamAssassin::Conf::TYPE_HEAD_TESTS)
   {
@@ -1398,17 +1387,7 @@ sub add_test {
       $conf->{test_qrs}->{$name} = $rec;
       $conf->{test_opt_header}->{$name} = $hdr;
       $conf->{test_opt_neg}->{$name} = 1 if $op eq '!~';
-      # Check for named regex capture templates
-      if (index($rec, '"""') >= 0) {
-        while ($rec =~ /"""(\w+)"""/g) {
-          $conf->{capture_rules}->{$name}->{$1} = 1;
-        }
-      }
-      # Make rules with captures run before anything else
-      if ($rec =~ /\(\?[<']\w/) {
-        dbg("config: adjusting regex capture rule $name priority to -10000");
-        $conf->{priority}->{$name} = -10000;
-      }
+      $self->parse_captures($name, $rec);
     }
   }
   elsif ($type == $Mail::SpamAssassin::Conf::TYPE_META_TESTS)
@@ -1533,6 +1512,23 @@ sub is_meta_valid {
   return 0;
 }
 
+sub parse_captures {
+  my ($self, $name, $re) = @_;
+
+  # Check for named regex capture templates
+  if (index($re, '"""') >= 0) {
+    local($1);
+    while ($re =~ /"""([A-Z][A-Z0-9_]*)"""/g) {
+      $self->{conf}->{capture_rules}->{$name}->{$1} = 1;
+    }
+  }
+  # Make rules with captures run before anything else
+  if ($re =~ /\(\?[<'][A-Z]/) {
+    dbg("config: adjusting regex capture rule $name priority to -10000");
+    $self->{conf}->{priority}->{$name} = -10000;
+  }
+}
+
 # Deprecated functions, leave just in case..
 sub is_delimited_regexp_valid {
   my ($self, $rule, $re) = @_;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Sat May 21 08:51:57 2022
@@ -3635,6 +3635,19 @@ sub all_to_addrs {
 
 ###########################################################################
 
+# Save and tag regex named captures, $captures is ref to %- results
+sub set_captures {
+  my ($self, $captures) = @_;
+
+  foreach my $cname (keys %$captures) {
+    my @cvals = do { my %seen; grep { !$seen{$_}++ } @{$captures->{$cname}} };
+    $self->{capture_values}->{$cname} = \@cvals;
+    $self->set_tag($cname, @cvals == 1 ? $cvals[0] : \@cvals);
+  }
+}
+
+###########################################################################
+
 1;
 __END__
 

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Check.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Check.pm?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Check.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Check.pm Sat May 21 08:51:57 2022
@@ -1376,11 +1376,7 @@ sub ran_rule_plugin_code {
   # Set tags from captured values
   my $code = '
     if (%captures) {
-      foreach my $cname (keys %captures) {
-        my @cvals = do { my %seen; grep { !$seen{$_}++ } @{$captures{$cname}} };
-        $self->{capture_values}->{$cname} = \@cvals;
-        $self->set_tag($cname, @cvals == 1 ? $cvals[0] : \@cvals);
-      }
+      $self->set_captures(\%captures);
       %captures = ();
     }
   ';

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/MIMEHeader.pm Sat May 21 08:51:57 2022
@@ -63,7 +63,7 @@ Part 1 = main message headers. Part 2 =
 
 =items tflags NAME_OF_RULE concat
 
-Concat all headers from all mime parts (possible range applied) into a
+Concatenate all headers from all mime parts (possible range applied) into a
 single string for matching.  This allows matching headers across multiple
 parts with single regex.  Normally pattern is tested individually for
 different mime parts.
@@ -168,6 +168,9 @@ sub set_config {
       $self->{parser}->add_test($rulename, $evalfn."()",
                 $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
 
+      # Support named regex captures
+      $self->{parser}->parse_captures($rulename, $rec);
+
       # evalfn/rulename safe, sanitized by $RULENAME_RE
       my $evalcode = '
         sub Mail::SpamAssassin::Plugin::MIMEHeader::'.$evalfn.' {
@@ -195,25 +198,23 @@ sub set_config {
 # ---------------------------------------------------------------------------
 
 sub eval_hook_called {
-  my ($pobj, $scanner, $rulename) = @_;
+  my ($pobj, $pms, $rulename) = @_;
 
-  my $rule = $scanner->{conf}->{mimeheader_tests}->{$rulename};
+  my $conf = $pms->{conf};
+  my $rule = $conf->{mimeheader_tests}->{$rulename};
   my $hdr = $rule->{hdr};
   my $negated = $rule->{negated};
-  my $if_unset = $rule->{if_unset};
   my $pattern = $rule->{pattern};
-
-
-  my $getraw;
+  my $tflags = $conf->{tflags}->{$rulename}||'';
+  
+  my $getraw = 0;
   if ($hdr =~ s/:raw$//) {
     $getraw = 1;
-  } else {
-    $getraw = 0;
   }
 
   my $range_min = 0;
   my $range_max = 1000;
-  if (($scanner->{conf}->{tflags}->{$rulename}||'') =~ /(?:^|\s)range=(\d+)?(-)?(\d+)?(?:\s|$)/) {
+  if ($tflags =~ /(?:^|\s)range=(\d+)?(-)?(\d+)?(?:\s|$)/) {
     if (defined $1 && defined $2 && defined $3) {
       $range_min = $1;
       $range_max = $3;
@@ -229,11 +230,14 @@ sub eval_hook_called {
     }
   }
 
-  my $concat = ($scanner->{conf}->{tflags}->{$rulename}||'') =~ /\bconcat\b/;
+  my $multiple = $tflags =~ /\bmultiple\b/;
+  my $concat = $tflags =~ /\bconcat\b/;
+  my $maxhits = $tflags =~ /\bmaxhits=(\d+)\b/ ? $1 :
+                           $multiple ? 1000 : 1;
   my $cval = '';
 
   my $idx = 0;
-  foreach my $p ($scanner->{msg}->find_parts(qr/./)) {
+  foreach my $p ($pms->{msg}->find_parts(qr/./)) {
     $idx++;
     last if $idx > $range_max;
     next if $idx < $range_min;
@@ -241,13 +245,12 @@ sub eval_hook_called {
     my $val;
     if ($hdr eq 'ALL') {
       $val = $p->get_all_headers($getraw, 0);
-    }
-    elsif ($getraw) {
+    } elsif ($getraw) {
       $val = $p->raw_header($hdr);
     } else {
       $val = $p->get_header($hdr);
     }
-    $val = $if_unset if !defined $val;
+    $val = $rule->{if_unset}  if !defined $val;
 
     if ($concat) {
       $val .= "\n" unless $val =~ /\n$/;
@@ -255,19 +258,14 @@ sub eval_hook_called {
       next;
     }
 
-    if ($val =~ /$pattern/p) {
-      next if $negated;
-      my $match = defined ${^MATCH} ? ${^MATCH} : "<negative match>";
-      dbg("mimeheader: ran rule $rulename ======> got hit: \"$match\" (part $idx)");
-      return 1;
+    if (_check($pms, $rulename, $val, $pattern, $negated, $maxhits, "part $idx")) {
+      return 0;
     }
   }
 
   if ($concat) {
-    if (!$negated && $cval =~ /$pattern/p) {
-      my $match = defined ${^MATCH} ? ${^MATCH} : "<negative match>";
-      dbg("mimeheader: ran rule $rulename ======> got hit: \"$match\" (concat)");
-      return 1;
+    if (_check($pms, $rulename, $cval, $pattern, $negated, $maxhits, 'concat')) {
+      return 0;
     }
   }
 
@@ -279,6 +277,27 @@ sub eval_hook_called {
   return 0;
 }
 
+sub _check {
+  my ($pms, $rulename, $value, $pattern, $negated, $maxhits, $desc) = @_;
+
+  my $hits = 0;
+  my %captures;
+  while ($value =~ /$pattern/gp) {
+    last if $negated;
+    if (%-) {
+      foreach my $cname (keys %-) {
+        push @{$captures{$cname}}, grep { $_ ne "" } @{$-{$cname}};
+      }
+    }
+    my $match = defined ${^MATCH} ? ${^MATCH} : "<negative match>";
+    $pms->got_hit($rulename, '', ruletype => 'eval');
+    dbg("mimeheader: ran rule $rulename ======> got hit: \"$match\" ($desc)");
+    last if ++$hits >= $maxhits;
+  }
+  $pms->set_captures(\%captures) if %captures;
+  return $hits;
+}
+
 # ---------------------------------------------------------------------------
 
 sub finish_tests {
@@ -294,6 +313,8 @@ sub finish_tests {
 
 sub has_all_header { 1 } # Supports ALL header query (Bug 5582)
 sub has_tflags_range { 1 } # Supports tflags range=x-y
-sub has_tflags_concat { 1 } # Support tflags concat
+sub has_tflags_concat { 1 } # Supports tflags concat
+sub has_tflags_multiple { 1 } # Supports tflags multiple
+sub has_capture_rules { 1 } # Supports named regex captures (Bug 7992)
 
 1;

Modified: spamassassin/trunk/t/mimeheader.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/mimeheader.t?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/t/mimeheader.t (original)
+++ spamassassin/trunk/t/mimeheader.t Sat May 21 08:51:57 2022
@@ -2,26 +2,32 @@
 
 use lib '.'; use lib 't';
 use SATest; sa_t_init("mimeheader");
-use Test::More tests => 12;
+use Test::More tests => 18;
 
 # ---------------------------------------------------------------------------
 
 %patterns = (
-  q{ 1.0 MIMEHEADER_TEST1 }, q{ test1 },
-  q{ 1.0 MIMEHEADER_TEST2 }, q{ test2 },
-  q{ 1.0 MATCH_NL_NONRAW }, q{ match_nl_nonraw },
-  q{ 1.0 MATCH_NL_RAW }, q{ match_nl_raw },
-  q{ 1.0 MIMEHEADER_FOUND1 }, q{ unset_found },
-  q{ 1.0 MIMEHEADER_FOUND2 }, q{ negate_found },
-  q{ 1.0 MIMEHEADER_CONCAT1 }, q{ concat1_found },
-  q{ 1.0 MIMEHEADER_RANGE1 }, q{ range1_found },
-  q{ 1.0 MIMEHEADER_RANGE2 }, q{ range2_found },
-  q{ 1.0 MIMEHEADER_RANGE3 }, q{ range3_found },
-  q{ 1.0 MIMEHEADER_RANGE4 }, q{ range4_found },
+  q{ 1.0 MIMEHEADER_TEST1 }, '',
+  q{ 1.0 MIMEHEADER_TEST2 }, '',
+  q{ 1.0 MATCH_NL_NONRAW }, '',
+  q{ 1.0 MATCH_NL_RAW }, '',
+  q{ 1.0 MIMEHEADER_FOUND1 }, '',
+  q{ 1.0 MIMEHEADER_FOUND2 }, '',
+  q{ 1.0 MIMEHEADER_CONCAT1 }, '',
+  q{ 1.0 MIMEHEADER_RANGE1 }, '',
+  q{ 1.0 MIMEHEADER_RANGE2 }, '',
+  q{ 1.0 MIMEHEADER_RANGE3 }, '',
+  q{ 1.0 MIMEHEADER_RANGE4 }, '',
+  q{ 1.0 MIMEHEADER_MULTI1 }, '',
+  q{ 1.0 MIMEHEADER_MULTIMETA1 }, '',
+  q{ 1.0 MIMEHEADER_MULTI2 }, '',
+  q{ 1.0 MIMEHEADER_MULTIMETA2 }, '',
+  q{ 1.0 MIMEHEADER_CAPTURE1 }, '',
+  q{/tag MIMECAP1 is now ready, value: text/plain\n/}, '',
 );
 
 %anti_patterns = (
-  q{ MIMEHEADER_NOTFOUND }, q{ notfound },
+  q{ MIMEHEADER_NOTFOUND }, '',
 );
 
 tstprefs (q{
@@ -51,8 +57,19 @@ tstprefs (q{
   mimeheader MIMEHEADER_RANGE4 Content-Type =~ /Jurek/
   tflags MIMEHEADER_RANGE4 range=-10
 
+  # multiple
+  mimeheader MIMEHEADER_MULTI1 Content-Type =~ /-[82]/ # iso-8859-2, two matches
+  tflags MIMEHEADER_MULTI1 multiple
+  meta MIMEHEADER_MULTIMETA1 MIMEHEADER_MULTI1 == 2
+  mimeheader MIMEHEADER_MULTI2 ALL =~ /^X-/m # Count X- starting headers
+  tflags MIMEHEADER_MULTI2 multiple
+  meta MIMEHEADER_MULTIMETA2 MIMEHEADER_MULTI2 == 4
+
+  # named regex capture
+  mimeheader MIMEHEADER_CAPTURE1 Content-Type =~ /(?<MIMECAP1>text\/\w+)/
 });
 
-sarun ("-D mimeheader -L -t < data/nice/004 2>&1", \&patterns_run_cb);
+# The 'check' debug channel must be enabled for the tag pattern above to match
+sarun ("-D check -L -t < data/nice/004 2>&1", \&patterns_run_cb);
 ok_all_patterns();
 

Added: spamassassin/trunk/t/regexp_named_capture.t
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/regexp_named_capture.t?rev=1901096&view=auto
==============================================================================
--- spamassassin/trunk/t/regexp_named_capture.t (added)
+++ spamassassin/trunk/t/regexp_named_capture.t Sat May 21 08:51:57 2022
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -T
+
+use lib '.'; 
+use lib 't';
+use SATest; sa_t_init("regexp_named_capture");
+
+use Test::More;
+plan tests => 10;
+
+# ---------------------------------------------------------------------------
+
+%patterns = (
+  q{ TEST_CAPTURE_1 } => '',
+  q{ TEST_CAPTURE_2 } => '',
+  q{ TEST_CAPTURE_3 } => '',
+  q{ TEST_CAPTURE_4 } => '',
+  q{ TEST_CAPTURE_5 } => '',
+  q{/tag TESTCAP1 is now ready, value: Ximian\n/} => '',
+  q{/tag TESTCAP2 is now ready, value: Ximian\n/} => '',
+  q{/tag TESTCAP3 is now ready, value: gnome.org\n/} => '',
+  q{/tag TESTCAP4 is now ready, value: milkplus\n/} => '',
+  q{/tag TESTCAP5 is now ready, value: release\n/} => '',
+);
+%anti_patterns = ();
+
+tstlocalrules (q{
+   body TEST_CAPTURE_1 /release of (?<TESTCAP1>\w+)/
+   rawbody TEST_CAPTURE_2 /release of (?<TESTCAP2>\w+)/
+   uri TEST_CAPTURE_3 /ftp\.(?<TESTCAP3>[\w.]+)/
+   header TEST_CAPTURE_4 Message-ID =~ /@(?<TESTCAP4>\w+)/
+   full TEST_CAPTURE_5 /X-Spam-Status.* preview (?<TESTCAP5>\w+)/s
+});
+
+sarun ("-D check -L -t < data/nice/001 2>&1", \&patterns_run_cb);
+ok_all_patterns();
+

Propchange: spamassassin/trunk/t/regexp_named_capture.t
------------------------------------------------------------------------------
    svn:executable = *

Modified: spamassassin/trunk/t/testrules.yml
URL: http://svn.apache.org/viewvc/spamassassin/trunk/t/testrules.yml?rev=1901096&r1=1901095&r2=1901096&view=diff
==============================================================================
--- spamassassin/trunk/t/testrules.yml (original)
+++ spamassassin/trunk/t/testrules.yml Sat May 21 08:51:57 2022
@@ -9,6 +9,7 @@ seq:
     - t/uri*.t
     - t/get*.t
     - t/header*.t
+    - t/regexp*.t
     - t/*dns*.t
     - t/rule*.t
   # tests that are not parallel-ready (will run in isolation)