You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/11/16 15:03:03 UTC

svn commit: r475742 - in /spamassassin/branches/jm_re2c_hacks: lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm t/re_base_extraction.t

Author: jm
Date: Thu Nov 16 06:03:03 2006
New Revision: 475742

URL: http://svn.apache.org/viewvc?view=rev&rev=475742
Log:
fix base extraction on FUZZY_ rules; exclusions should be ignored

Modified:
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
    spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=475742&r1=475741&r2=475742
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Thu Nov 16 06:03:03 2006
@@ -74,9 +74,7 @@
   my ($self, $conf) = @_;
 
   my $main = $conf->{main};
-  if (!$main->{base_extract}) {
-    return;         # TODO: comment this for Rabin-Karp
-  }
+  if (!$main->{base_extract}) { return; }
 
   $self->extract_set($conf, $conf->{body_tests}, 'body');
 }
@@ -313,6 +311,9 @@
   $rule =~ s/\(\\b\|\^\)//gs;
   $rule =~ s/\(\\b\|\$\)//gs;
 
+  # remove negative lookaheads, e.g. (?!credit)
+  $rule =~ s/\(\?\![^\)]+\)//gs;
+
   # remove \b's
   $rule =~ s/\\b//gs;
 
@@ -386,13 +387,18 @@
   # still problematic; kill all "x?" statements
   $rule =~ s/.\?.*$//gsx;
 
-  # simplify (..)? and (..|) to (..|z{0})
-  # this wierd construct is to work around an re2c bug; (..|) doesn't
-  # do what it should
   if ($main->{bases_can_use_alternations}) {
-    $rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
-    $rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
-    $rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
+    $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
+    $rule =~ s/\((.*?)\|\)/\($1\|\)/gs;
+    $rule =~ s/\(\|(.*?)\)/\($1\|\)/gs;
+
+    # simplify (..)? and (..|) to (..|z{0}); this weird construct is to work
+    # around an re2c bug; (..|) doesn't do what it should. off for now; re2c's
+    # alt support isn't actually usable anyway due to bugs with how it handles
+    # overlapping patterns.
+    #$rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
+    #$rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
+    #$rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
   }
 
   # re2xs doesn't like escaped brackets;
@@ -475,21 +481,21 @@
     # count (...braces...) to ensure the numbers match up
     my @c = ($rule =~ /(?<!\\)\(/g); my $brace_i = scalar @c;
        @c = ($rule =~ /(?<!\\)\)/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { die "brace mismatch"; }
+    if ($brace_i != $brace_o) { die "brace mismatch in '$rule'"; }
   }
 
   # do the same for [charclasses]
   {
     my @c = ($rule =~ /(?<!\\)\[/g); my $brace_i = scalar @c;
        @c = ($rule =~ /(?<!\\)\]/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { die "charclass mismatch"; }
+    if ($brace_i != $brace_o) { die "charclass mismatch in '$rule'"; }
   }
 
   # and {quantifiers}
   {
     my @c = ($rule =~ /(?<!\\)\{/g); my $brace_i = scalar @c;
        @c = ($rule =~ /(?<!\\)\}/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { die "quantifier mismatch"; }
+    if ($brace_i != $brace_o) { die "quantifier mismatch in '$rule'"; }
   }
 
   # lookaheads that are just too far for the re2c parser

Modified: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t?view=diff&rev=475742&r1=475741&r2=475742
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t Thu Nov 16 06:03:03 2006
@@ -13,7 +13,7 @@
   if (-e 't/test_dir') { chdir 't'; } 
   if (-e 'test_dir') { unshift(@INC, '../blib/lib'); }
 
-  plan tests => 20;
+  plan tests => 25;
 
 };
 use lib '../lib';
@@ -27,7 +27,9 @@
     body TEST3 /foody? bar/
     body TEST4 /A(?i:ct) N(?i:ow)/
     body TEST5 /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
-    # body TEST6 /(?:Current|Target)(?: Price)?:\s+\$(?:O\.|\d\.O)/
+    body TEST6 /(?:Current|Target)(?: Price)?:\s+\$(?:O\.|\d\.O)/
+    body TEST7 /(?!credit)[ck\xc7\xe7@]\W?r\W?[e3\xc8\xc9\xca\xcb\xe8\xe9\xea\xeb\xa4]\W?[d\xd0]\W?[il|!1y?\xcc\xcd\xce\xcf\xec\xed\xee\xef]\W?t/i
+
 
 ', {
     base_extract => 1,
@@ -49,7 +51,15 @@
     'foody bar:TEST3 TEST2',
     'refinanc:TEST5',
     'time to refinance:TEST5',
+    'target:TEST6',
+    'target price:TEST6',
+    'current:TEST6',
+    'current price:TEST6',
+
+], [
 
+    # we do not want to see these
+    '!credit:TEST7'
 
 ]);