You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/11/16 15:03:03 UTC
svn commit: r475742 - in /spamassassin/branches/jm_re2c_hacks:
lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm t/re_base_extraction.t
Author: jm
Date: Thu Nov 16 06:03:03 2006
New Revision: 475742
URL: http://svn.apache.org/viewvc?view=rev&rev=475742
Log:
fix base extraction on FUZZY_ rules; exclusions should be ignored
Modified:
spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=475742&r1=475741&r2=475742
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Thu Nov 16 06:03:03 2006
@@ -74,9 +74,7 @@
my ($self, $conf) = @_;
my $main = $conf->{main};
- if (!$main->{base_extract}) {
- return; # TODO: comment this for Rabin-Karp
- }
+ if (!$main->{base_extract}) { return; }
$self->extract_set($conf, $conf->{body_tests}, 'body');
}
@@ -313,6 +311,9 @@
$rule =~ s/\(\\b\|\^\)//gs;
$rule =~ s/\(\\b\|\$\)//gs;
+ # remove (?!credit)
+ $rule =~ s/\(\?\![^\)]+\)//gs;
+
# remove \b's
$rule =~ s/\\b//gs;
@@ -386,13 +387,18 @@
# still problematic; kill all "x?" statements
$rule =~ s/.\?.*$//gsx;
- # simplify (..)? and (..|) to (..|z{0})
- # this wierd construct is to work around an re2c bug; (..|) doesn't
- # do what it should
if ($main->{bases_can_use_alternations}) {
- $rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
- $rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
- $rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
+ $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
+ $rule =~ s/\((.*?)\|\)/\($1\|\)/gs;
+ $rule =~ s/\(\|(.*?)\)/\($1\|\)/gs;
+
+ # simplify (..)? and (..|) to (..|z{0}); this weird construct is to work
+ # around an re2c bug; (..|) doesn't do what it should. off for now; re2c's
+ # alt support isn't actually usable anyway due to bugs with how it handles
+ # overlapping patterns.
+ #$rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
+ #$rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
+ #$rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
}
# re2xs doesn't like escaped brackets;
@@ -475,21 +481,21 @@
# count (...braces...) to ensure the numbers match up
my @c = ($rule =~ /(?<!\\)\(/g); my $brace_i = scalar @c;
@c = ($rule =~ /(?<!\\)\)/g); my $brace_o = scalar @c;
- if ($brace_i != $brace_o) { die "brace mismatch"; }
+ if ($brace_i != $brace_o) { die "brace mismatch in '$rule'"; }
}
# do the same for [charclasses]
{
my @c = ($rule =~ /(?<!\\)\[/g); my $brace_i = scalar @c;
@c = ($rule =~ /(?<!\\)\]/g); my $brace_o = scalar @c;
- if ($brace_i != $brace_o) { die "charclass mismatch"; }
+ if ($brace_i != $brace_o) { die "charclass mismatch in '$rule'"; }
}
# and {quantifiers}
{
my @c = ($rule =~ /(?<!\\)\{/g); my $brace_i = scalar @c;
@c = ($rule =~ /(?<!\\)\}/g); my $brace_o = scalar @c;
- if ($brace_i != $brace_o) { die "quantifier mismatch"; }
+ if ($brace_i != $brace_o) { die "quantifier mismatch in '$rule'"; }
}
# lookaheads that are just too far for the re2c parser
Modified: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t?view=diff&rev=475742&r1=475741&r2=475742
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t Thu Nov 16 06:03:03 2006
@@ -13,7 +13,7 @@
if (-e 't/test_dir') { chdir 't'; }
if (-e 'test_dir') { unshift(@INC, '../blib/lib'); }
- plan tests => 20;
+ plan tests => 25;
};
use lib '../lib';
@@ -27,7 +27,9 @@
body TEST3 /foody? bar/
body TEST4 /A(?i:ct) N(?i:ow)/
body TEST5 /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
- # body TEST6 /(?:Current|Target)(?: Price)?:\s+\$(?:O\.|\d\.O)/
+ body TEST6 /(?:Current|Target)(?: Price)?:\s+\$(?:O\.|\d\.O)/
+ body TEST7 /(?!credit)[ck\xc7\xe7@]\W?r\W?[e3\xc8\xc9\xca\xcb\xe8\xe9\xea\xeb\xa4]\W?[d\xd0]\W?[il|!1y?\xcc\xcd\xce\xcf\xec\xed\xee\xef]\W?t/i
+
', {
base_extract => 1,
@@ -49,7 +51,15 @@
'foody bar:TEST3 TEST2',
'refinanc:TEST5',
'time to refinance:TEST5',
+ 'target:TEST6',
+ 'target price:TEST6',
+ 'current:TEST6',
+ 'current price:TEST6',
+
+], [
+ # we do not want to see these
+ '!credit:TEST7'
]);