You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/10/27 17:00:47 UTC

svn commit: r468404 - /spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Author: jm
Date: Fri Oct 27 08:00:46 2006
New Revision: 468404

URL: http://svn.apache.org/viewvc?view=rev&rev=468404
Log:
support alternation splitting, too

Modified:
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=468404&r1=468403&r2=468404
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Oct 27 08:00:46 2006
@@ -55,7 +55,7 @@
   my $self = $class->SUPER::new($mailsaobject);
   bless ($self, $class);
 
-  # $self->test();
+  # $self->test(); exit;
   return $self;
 }
 
@@ -130,9 +130,11 @@
     eval {  # catch die()s
       @bases1 = $self->extract_hints($rule, 0);
     };
+    $@ and dbg("giving up on that direction: $@");
     eval {
       @bases2 = $self->extract_hints($rule, 1);
     };
+    $@ and dbg("giving up on that direction: $@");
 
     # if any of the extracted hints in a set are too short, the entire
     # set is invalid; this is because each set of N hints represents just
@@ -376,6 +378,14 @@
               \]
             ).*$//gsx;
 
+  if ($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS) {
+    # /foo (bar)? baz/ simplify to /foo (bar|) baz/
+    $rule =~ s/(?<!\\)(\([^\(\)]*)\)\?/$1\|\)/gs;
+
+    # /foo bar? baz/ simplify to /foo ba(r|) baz/
+    $rule =~ s/(?<!\\)(.)\?/($1\|\)/gs;
+  }
+
   $BASES_CAN_USE_QUANTIFIERS or $rule =~ s/(?<!\\)(?:
               .\*|	# remove the quantified char, too
               .\+|
@@ -434,7 +444,7 @@
   $rule =~ s/\\w/[_a-z0-9]/gs;
   $rule =~ s/\\W/[^_a-z0-9]/gs;
 
-  # loop here, to catch __DRUGS_SLEEP1:
+  # {loop here, to catch __DRUGS_SLEEP1:
   # 0,3}([ \t\n]|z{0})
   while (1) 
   {
@@ -446,7 +456,6 @@
     if ($rule =~ /^\((?:
               \.?[\*\?\+] |
               \.?\{?[^\{]*\} |
-              [^\(]*\) |
               \[ |
               [^\[]*\]
             )/sx)
@@ -477,7 +486,6 @@
     last if $startrule eq $rule;
   }
 
-
   # return for things we know we can't handle.
   if (!($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS)) {
     if ($rule =~ /\|/) {
@@ -486,6 +494,7 @@
     }
   }
 
+
   {
     # count (...braces...) to ensure the numbers match up
     my @c = ($rule =~ /(?<!\\)\(/g); my $brace_i = scalar @c;
@@ -601,7 +610,7 @@
   # trim unnecessary group markers, e.g. /f(oo)/ => /foo/
   $re =~ s/\(([^\(\)\|]*)\)/$1/gs;
 
-  # identify the smallest nested (...|...) scope
+  # identify the deepest-nested (...|...) scope
   $re =~ m{
       ^(.*)
       (?<!\\)\(([^\(\)]*?\|[^\(\)]*?)\)
@@ -619,7 +628,10 @@
 
   # and expand it
   my @out = ();
-  foreach my $str (split (/(?<!\\)\|/, $alts)) {
+
+  # the 999999 LIMIT matters: without a LIMIT, split() strips trailing empty
+  # fields, so '(foo|)' would be split as ('foo') instead of ('foo', '')
+  foreach my $str (split (/(?<!\\)\|/, $alts, 999999)) {
     $str = $pre.$str.$post;
     # are there unresolved groups left?
     if ($str =~ /(?<!\\)[\(\|\)]/) {
@@ -646,12 +658,14 @@
   $self->test_split_alt("foo", "/foo/");
   $self->test_split_alt("(foo)", "/foo/");
   $self->test_split_alt("foo(bar)baz", "/foobarbaz/");
+  $self->test_split_alt("x(foo|)", "/xfoo/ /x/");
+  $self->test_split_alt("fo(o|)", "/foo/ /fo/");
   $self->test_split_alt("(foo|bar)", "/foo/ /bar/");
   $self->test_split_alt("foo|bar", "/foo/ /bar/");
   $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
   $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
   $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
-  exit;
+  $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
 }
 
 sub test_split_alt {