You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/11/07 00:01:02 UTC
svn commit: r471922 - in /spamassassin/branches/jm_re2c_hacks: ./ lib/Mail/
lib/Mail/SpamAssassin/Plugin/ rule2xs/ rule2xs/RabinKarpAccel-0.01/ t/
Author: jm
Date: Mon Nov 6 15:01:01 2006
New Revision: 471922
URL: http://svn.apache.org/viewvc?view=rev&rev=471922
Log:
add test cases for base-string extraction; add sa-compile script, used to compile the ruleset; use /var/lib/spamassassin/compiled/VERSION as the location for compiled rulesets
Added:
spamassassin/branches/jm_re2c_hacks/sa-compile.raw (with props)
spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t (with props)
Removed:
spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs
spamassassin/branches/jm_re2c_hacks/run
Modified:
spamassassin/branches/jm_re2c_hacks/MANIFEST
spamassassin/branches/jm_re2c_hacks/Makefile.PL
spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL
Modified: spamassassin/branches/jm_re2c_hacks/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/MANIFEST?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/MANIFEST (original)
+++ spamassassin/branches/jm_re2c_hacks/MANIFEST Mon Nov 6 15:01:01 2006
@@ -542,3 +542,4 @@
lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
t/spamd_kill_restart.t
t/spamd_kill_restart_rr.t
+t/re_base_extraction.t
Modified: spamassassin/branches/jm_re2c_hacks/Makefile.PL
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/Makefile.PL?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/Makefile.PL (original)
+++ spamassassin/branches/jm_re2c_hacks/Makefile.PL Mon Nov 6 15:01:01 2006
@@ -167,6 +167,7 @@
'spamassassin.raw' => 'spamassassin',
'sa-learn.raw' => 'sa-learn',
'sa-update.raw' => 'sa-update',
+ 'sa-compile.raw' => 'sa-compile',
'spamc/spamc.c' => 'spamc/spamc$(EXE_EXT)',
'spamd/spamd.raw' => 'spamd/spamd',
},
@@ -180,6 +181,7 @@
'lib/spamassassin-run.pod' => '$(INST_MAN1DIR)/spamassassin-run.$(MAN1EXT)',
'sa-learn' => '$(INST_MAN1DIR)/sa-learn.$(MAN1EXT)',
'sa-update' => '$(INST_MAN1DIR)/sa-update.$(MAN1EXT)',
+ 'sa-compile' => '$(INST_MAN1DIR)/sa-compile.$(MAN1EXT)',
'spamc/spamc.pod' => '$(INST_MAN1DIR)/spamc.$(MAN1EXT)',
'spamd/spamd' => '$(INST_MAN1DIR)/spamd.$(MAN1EXT)',
},
@@ -223,7 +225,7 @@
},
'clean' => { FILES => join(' ' =>
- 'sa-learn', 'sa-update', 'spamassassin',
+ 'sa-learn', 'sa-update', 'spamassassin', 'sa-compile',
'spamd/spamd',
@@ -1116,6 +1118,9 @@
sa-update: sa-update.raw build_rules
$(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -isa-update.raw -osa-update
+
+sa-compile: sa-compile.raw
+ $(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -isa-compile.raw -osa-compile
spamd/spamd: spamd/spamd.raw
$(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -i$? -o$@
Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm Mon Nov 6 15:01:01 2006
@@ -289,7 +289,8 @@
=item LOCAL_STATE_DIR
Location of the local state directory, mainly used for installing updates via
-C<sa-update>. Defaults to "@@LOCAL_STATE_DIR@@".
+C<sa-update> and compiling rulesets to native code. Defaults to
+"@@LOCAL_STATE_DIR@@".
=back
Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Mon Nov 6 15:01:01 2006
@@ -20,8 +20,8 @@
=head1 SYNOPSIS
-This is a work-in-progress plugin to extract "base" strings from SpamAssassin
-'body' rules, suitable for use in Rule2XSBody rules.
+This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
+suitable for use in Rule2XSBody rules or other parallel matching algorithms.
=cut
@@ -37,14 +37,17 @@
use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);
-# a few settings that control what kind of bases are output:
+# a few settings that control what kind of bases are output.
# treat all rules as lowercase for purposes of term extraction?
-my $BASES_MUST_BE_CASE_I = 1;
-my $BASES_CAN_USE_ALTERNATIONS = 0; # /(foo|bar|baz)/
-my $BASES_CAN_USE_QUANTIFIERS = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
-my $BASES_CAN_USE_CHAR_CLASSES = 0; # /fo[opqr]bar/
-my $SPLIT_OUT_ALTERNATIONS = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
+# $main->{bases_must_be_casei} = 1;
+# $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
+# $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
+# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
+# $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
+
+# TODO: it would be nice to have a clean API to pass such settings
+# through to plugins instead of hanging them off $main
###########################################################################
@@ -70,32 +73,20 @@
sub extract_bases {
my ($self, $conf) = @_;
- # TODO: need a better way to do this rather than using an env
- # var as a back channel
- my $rawf = $ENV{'RULE_REGEXP_DUMP_FILE'};
- my $f;
-
- if ($rawf) {
- $rawf =~ /^(.*)$/;
- $f = $1; # untaint; allow anything here, it's from %ENV and safe
- }
- else {
+ my $main = $conf->{main};
+ if (!$main->{base_extract}) {
return; # TODO: comment this for Rabin-Karp
}
- $self->extract_set($f, $conf, $conf->{body_tests}, 'body');
+ $self->extract_set($conf, $conf->{body_tests}, 'body');
}
sub extract_set {
- my ($self, $dumpfile, $conf, $test_set, $ruletype) = @_;
+ my ($self, $conf, $test_set, $ruletype) = @_;
foreach my $pri (keys %{$test_set}) {
my $nicepri = $pri; $nicepri =~ s/-/neg/g;
$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
-
- if ($dumpfile) {
- $self->dump_base_strings($dumpfile, $conf, $ruletype.'_'.$nicepri);
- }
}
}
@@ -109,6 +100,8 @@
my $yes = 0;
my $no = 0;
+ $self->{main} = $conf->{main}; # for use in extract_hints()
+
dbg("zoom: base extraction start for type $ruletype");
# attempt to find good "base strings" (simplified regexp subsets) for each
@@ -254,27 +247,7 @@
$conf->{base_string}->{$ruletype}->{$base} = $key;
}
- warn ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
-}
-
-###########################################################################
-
-sub dump_base_strings {
- my ($self, $dumpfile, $conf, $ruletype) = @_;
-
- open (OUT, ">$dumpfile") or die "cannot write to $dumpfile!";
- print OUT "name $ruletype\n";
-
- foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
- print OUT "orig $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
- }
-
- foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
- print OUT "r $key:$conf->{base_string}->{$ruletype}->{$key}\n";
- }
- close OUT or die "close failed on $dumpfile!";
-
- warn ("zoom: bases written to '$dumpfile'\n");
+ info ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
}
###########################################################################
@@ -290,6 +263,7 @@
my $rule = shift;
my $is_reversed = shift;
+ my $main = $self->{main};
my $orig = $rule;
$rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
@@ -310,7 +284,7 @@
# anyway. Simplification that causes the regexp to *not* hit
# stuff that the "real" rule would hit, however, is a bad thing.
- if ($BASES_MUST_BE_CASE_I) {
+ if ($main->{bases_must_be_casei}) {
$rule = lc $rule;
$mods =~ s/i//;
@@ -355,7 +329,8 @@
$rule =~ s/\(\?:/\(/g;
# this must be before reversing
- if ($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS) {
+ if ($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations})
+ {
# /foo (bar)? baz/ simplify to /foo (bar|) baz/
$rule =~ s/(?<!\\)(\([^\(\)]*)\)\?/$1\|\)/gs;
@@ -380,21 +355,21 @@
\\[ABCE-RT-VX-Z]
).*$//gsx;
- $BASES_CAN_USE_CHAR_CLASSES or $rule =~ s/(?<!\\)(?:
+ $main->{bases_can_use_char_classes} or $rule =~ s/(?<!\\)(?:
\\\w|
\.|
\[|
\]
).*$//gsx;
- $BASES_CAN_USE_QUANTIFIERS or $rule =~ s/(?<!\\)(?:
+ $main->{bases_can_use_quantifiers} or $rule =~ s/(?<!\\)(?:
.\*| # remove the quantified char, too
.\+|
.\?|
.\{
).*$//gsx;
- ($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS) or
+ ($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations}) or
$rule =~ s/(?<!\\)(?:
\(|
\)
@@ -414,7 +389,7 @@
# simplify (..)? and (..|) to (..|z{0})
# this wierd construct is to work around an re2c bug; (..|) doesn't
# do what it should
- if ($BASES_CAN_USE_ALTERNATIONS) {
+ if ($main->{bases_can_use_alternations}) {
$rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
$rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
$rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
@@ -488,7 +463,7 @@
}
# return for things we know we can't handle.
- if (!($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS)) {
+ if (!($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations})) {
if ($rule =~ /\|/) {
# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
die "alternations";
@@ -529,7 +504,7 @@
}
my @rules;
- if ($SPLIT_OUT_ALTERNATIONS && $rule =~ /\|/) {
+ if ($main->{bases_split_out_alternations} && $rule =~ /\|/) {
@rules = $self->split_alt($rule);
}
else {
Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm Mon Nov 6 15:01:01 2006
@@ -14,6 +14,24 @@
# limitations under the License.
# </...@LICENSE>
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::Rule2XSBody - speed up SpamAssassin by compiling regexps
+
+=head1 SYNOPSIS
+
+ loadplugin Mail::SpamAssassin::Plugin::Rule2XSBody
+
+=head1 DESCRIPTION
+
+This plugin will use native-code object files representing the ruleset,
+in order to provide significant speedups in rule evaluation.
+
+Note that C<sa-compile> must be run in advance, in order to compile the
+ruleset using C<re2c> and the C compiler.
+
+=cut
+
package Mail::SpamAssassin::Plugin::Rule2XSBody;
use Mail::SpamAssassin::Plugin;
@@ -41,6 +59,12 @@
sub finish_parsing_end {
my ($self, $params) = @_;
my $conf = $params->{conf};
+
+ my $instdir = $conf->{main}->sed_path
+ ('__local_state_dir__/compiled/__version__');
+ push @INC, $instdir, "$instdir/auto";
+ dbg "zoom: loading compiled ruleset from $instdir";
+
$self->setup_test_set ($conf, $conf->{body_tests}, 'body');
}
@@ -97,9 +121,10 @@
my $totalhasrules = scalar keys %{$hasrules};
my $pc_zoomed = ($found / ($totalhasrules || .001)) * 100;
+ $pc_zoomed = int($pc_zoomed * 1000) / 1000;
- dbg("zoom: $found compiled rules are available for type $ruletype; ".
- "$pc_zoomed\% were usable");
+ dbg("zoom: $found compiled rules are available for type $ruletype out ".
+ "of $totalhasrules ($pc_zoomed\%)");
$conf->{zoom_ruletypes_available} ||= { };
$conf->{zoom_ruletypes_available}->{$ruletype} = 1;
Modified: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL (original)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL Mon Nov 6 15:01:01 2006
@@ -8,7 +8,7 @@
PREREQ_PM => {}, # e.g., Module::Name => 1.1
($] >= 5.005 ? ## Add these new keywords supported since 5.005
(ABSTRACT_FROM => 'lib/RabinKarpAccel.pm', # retrieve abstract from module
- AUTHOR => 'A. U. Thor <jm@>') : ()),
+ AUTHOR => 'Justin Mason <jm...@jmason.org>') : ()),
LIBS => [''], # e.g., '-lm'
DEFINE => '', # e.g., '-DHAVE_SOMETHING'
INC => '-I.', # e.g., '-I. -I/usr/include/other'
Added: spamassassin/branches/jm_re2c_hacks/sa-compile.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/sa-compile.raw?view=auto&rev=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/sa-compile.raw (added)
+++ spamassassin/branches/jm_re2c_hacks/sa-compile.raw Mon Nov 6 15:01:01 2006
@@ -0,0 +1,859 @@
+#!/usr/bin/perl -w -T
+
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+my $PREFIX = '@@PREFIX@@'; # substituted at 'make' time
+my $DEF_RULES_DIR = '@@DEF_RULES_DIR@@'; # substituted at 'make' time
+my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@'; # substituted at 'make' time
+my $LOCAL_STATE_DIR = '@@LOCAL_STATE_DIR@@'; # substituted at 'make' time
+use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time
+
+use File::Spec;
+
+BEGIN {
+ my @bin = File::Spec->splitpath($0);
+ my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
+ || File::Spec->curdir;
+
+ if (-e $bin.'/lib/Mail/SpamAssassin.pm'
+ || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm')
+ {
+ if ( $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm' ) {
+ unshift(@INC, '../blib/lib');
+ }
+ else {
+ foreach (qw(lib ../lib/site_perl
+ ../lib/spamassassin ../share/spamassassin/lib))
+ {
+ my $dir = File::Spec->catdir($bin, split('/', $_));
+ if(-f File::Spec->catfile($dir, "Mail", "SpamAssassin.pm")) {
+ unshift(@INC, $dir); last;
+ }
+ }
+ }
+ }
+}
+
+use strict;
+use warnings;
+
+use Mail::SpamAssassin;
+use Getopt::Long;
+use File::Copy;
+use File::Path;
+
+use vars qw( %opt );
+Mail::SpamAssassin::Util::clean_path_in_taint_mode();
+Mail::SpamAssassin::Util::untaint_var( \%ENV );
+
+##############################################################################
+
+Getopt::Long::Configure(
+ qw(bundling no_getopt_compat
+ permute no_auto_abbrev no_ignore_case)
+);
+
+GetOptions(
+ 'list' => \$opt{'list'},
+
+ 'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
+ 'prefspath|prefs-file|p=s' => \$opt{'prefspath'},
+ 'siteconfigpath=s' => \$opt{'siteconfigpath'},
+ 'cf=s' => \@{$opt{'cf'}},
+ 'debug-level|D:s' => \$opt{'debug'},
+ 'help|h|?' => \$opt{'help'},
+ 'version|V' => \$opt{'version'},
+ )
+ or usage( 0, "Unknown option!" );
+
+if ( defined $opt{'help'} ) {
+ usage( 0, "For more information read the manual page" );
+}
+if ( defined $opt{'version'} ) {
+ print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
+ exit 0;
+}
+
+# set debug areas, if any specified (only useful for command-line tools)
+if (defined $opt{'debug'}) {
+ $opt{'debug'} ||= 'all';
+}
+
+# ensure the body-rule base extractor plugin is loaded, we use that
+my $post_config = q(
+ loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+).join("\n", @{$opt{'cf'}})."\n";
+
+my $spamtest = new Mail::SpamAssassin(
+ {
+ rules_filename => $opt{'configpath'},
+ site_rules_filename => $opt{'siteconfigpath'},
+ userprefs_filename => $opt{'prefspath'},
+ debug => $opt{'debug'},
+ local_tests_only => 1,
+ dont_copy_prefs => 1,
+ PREFIX => $PREFIX,
+ DEF_RULES_DIR => $DEF_RULES_DIR,
+ LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
+ LOCAL_STATE_DIR => $LOCAL_STATE_DIR,
+ post_config_text => $post_config,
+ }
+);
+
+# appropriate settings for rule2xs usage
+$spamtest->{base_extract} = 1;
+$spamtest->{bases_must_be_casei} = 1;
+$spamtest->{bases_can_use_alternations} = 0;
+$spamtest->{bases_can_use_quantifiers} = 0;
+$spamtest->{bases_can_use_char_classes} = 0;
+$spamtest->{bases_split_out_alternations} = 0;
+
+$spamtest->init(1);
+
+# this actually extracts the base rules in the plugin, as a side-effect
+my $res = $spamtest->lint_rules();
+
+if ($res) {
+ die "not compiling; 'spamassassin --lint' check failed!\n";
+}
+
+if ( defined $opt{'list'} ) {
+ print dump_base_strings();
+}
+else {
+ compile_base_strings();
+}
+
+$spamtest->finish();
+exit;
+
+##############################################################################
+
+sub dump_base_strings {
+ my $conf = $spamtest->{conf};
+
+ my $s = '';
+ foreach my $ruletype (sort keys %{$conf->{base_orig}}) {
+ $s .= "name $ruletype\n";
+
+ foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
+ $s .= "orig $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
+ }
+
+ foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
+ $s .= "r $key:$conf->{base_string}->{$ruletype}->{$key}\n";
+ }
+ }
+ return $s;
+}
+
+##############################################################################
+
+sub compile_base_strings {
+ my $dirpath = Mail::SpamAssassin::Util::secure_tmpdir();
+ die "secure_tmpdir failed" unless $dirpath && -w $dirpath;
+
+ open OUT, ">$dirpath/bases.in"
+ or die "cannot write to $dirpath/bases.in";
+ print OUT dump_base_strings();
+ close OUT or die "cannot write to $dirpath/bases.in";
+
+ chdir $dirpath; print "cd $dirpath\n";
+ rule2xs("bases.in");
+
+ my $instdir = $spamtest->sed_path('__local_state_dir__/compiled/__version__');
+
+ run(get_perl()." Makefile.PL ".
+ "PREFIX=$dirpath/ignored INSTALLSITEARCH=$instdir");
+
+ run("make install"); # into $instdir
+}
+
+sub run {
+ my @cmd = @_;
+ print join(' ',@cmd)."\n";
+ system(@cmd);
+ ($?>>8 != 0) and die "command failed!";
+}
+
+sub get_perl {
+ my $perl;
+ if ($^X =~ m|^/|) {
+ $perl = $^X;
+ } else {
+ use Config;
+ $perl = $Config{perlpath};
+ $perl =~ s|/[^/]*$|/$^X|;
+ }
+ $perl =~ /^(.*)$/;
+ return $1;
+}
+
+##############################################################################
+
+use constant MAX_RULES_PER_C_FILE => 200;
+
+sub rule2xs {
+ my $modname;
+ my $force = 1;
+ my $FILE = shift;
+
+ open(my $fh, "sort $FILE |") || die "open($FILE): $!";
+# read ruleset name from the first line in the file
+ my $ruleset_name;
+ $_ = <$fh>;
+ if (/^name\s+(\S+)/) {
+ $ruleset_name = $1;
+ }
+
+ if (!$modname) {
+ $modname = "Mail::SpamAssassin::CompiledRegexps::$ruleset_name";
+ }
+
+ our $PATH = $modname;
+ $PATH =~ s/::/-/g;
+ our $PMFILE = $modname;
+ $PMFILE =~ s/.*:://;
+ $PMFILE .= ".pm";
+ our $XSFILE = $PMFILE;
+ $XSFILE =~ s/\.pm$/.xs/;
+
+ $force and rmtree $PATH;
+ mkdir $PATH or (!$force and die "mkdir($PATH): $!");
+ chdir $PATH; print "cd $PATH\n";
+
+ my $cprefix = $modname; $cprefix =~ s/[^A-ZA-z0-9]+/_/gs;
+
+ my $numscans = 0;
+ my (@dot_star, @dot_plus);
+
+ my $has_rules = '';
+
+ while (!eof($fh)) {
+ $numscans++;
+
+ open(my $re, ">scanner${numscans}.re") || die "open(>scanner{$numscans}.re): $!";
+
+ print $re <<EOT;
+#define NULL ((char*) 0)
+#define YYCTYPE unsigned char
+#define YYCURSOR *p
+#define YYLIMIT *p
+#define YYMARKER q
+#define YYFILL(n)
+EOT
+
+ print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+/*!re2c
+EOT
+
+ my $line = 0;
+ while (<$fh>) {
+ next if /^#/;
+
+ if (/^orig\s+(\S+)\s+(.*)$/) {
+ my $name = $1;
+ my $regexp = $2;
+ $name =~ s/#/[hash]/gs;
+ $regexp =~ s/#/[hash]/gs;
+ $has_rules .= " q#$name# => q#$regexp#,\n";
+ next;
+ }
+
+ my ($regexp, $reason) = /^r (.*):(.*)$/;
+ die "no 'r REGEXP:REASON' in $_" unless defined $regexp;
+
+ if ($regexp =~ /^\.\*/) {
+ push @dot_star, "$regexp:$reason";
+ next;
+ }
+ elsif ($regexp =~ /^\.\+/) {
+ push @dot_plus, "$regexp:$reason";
+ next;
+ }
+ eval {
+ print $re "\t", fixup_re($regexp), " {return \"$reason\";}\n";
+ $line++;
+ };
+ $@ and handle_fixup_error($@, $regexp, $reason);
+ last if $line == MAX_RULES_PER_C_FILE;
+ }
+
+ print $re <<EOT;
+ [\\000-\\377] { return NULL; }
+*/
+}
+EOT
+
+ #last if $numscans == 2;
+ }
+
+ while (@dot_star) {
+ $numscans++;
+
+ open(my $re, ">scanner${numscans}.re") ||
+ die "open(>scanner{$numscans}.re): $!";
+
+ print $re <<EOT;
+#define NULL ((char*) 0)
+#define YYCTYPE unsigned char
+#define YYCURSOR *p
+#define YYLIMIT *p
+#define YYMARKER q
+#define YYFILL(n)
+EOT
+
+ print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+start:
+/*!re2c
+EOT
+
+ my $line = 0;
+ while ($_ = shift @dot_star) {
+ my ($regexp, $reason) = /^(.*):(.*)$/;
+ $regexp =~ s/^\.\*//;
+ eval {
+ print $re "\t", fixup_re($regexp), " {return \"$reason\";}\n";
+ $line++;
+ };
+ $@ and handle_fixup_error($@, $regexp, $reason);
+ last if $line == MAX_RULES_PER_C_FILE;
+ }
+
+ print $re <<EOT;
+ [\\001-\\377] { goto start; }
+ [\\000] {return NULL; }
+*/
+}
+EOT
+ }
+
+ while (@dot_plus) {
+ $numscans++;
+
+ open(my $re, ">scanner${numscans}.re") ||
+ die "open(>scanner{$numscans}.re): $!";
+
+ print $re <<EOT;
+#define NULL ((char*) 0)
+#define YYCTYPE unsigned char
+#define YYCURSOR *p
+#define YYLIMIT *p
+#define YYMARKER q
+#define YYFILL(n)
+EOT
+
+ print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+p++;
+start:
+/*!re2c
+EOT
+
+ my $line = 0;
+ while ($_ = shift @dot_plus) {
+ my ($regexp, $reason) = /^(.*):(.*)$/;
+ $regexp =~ s/^\.\+//;
+
+ eval {
+ print $re "\t", fixup_re($regexp), " {return \"$reason\";}\n";
+ $line++;
+ };
+ $@ and handle_fixup_error($@, $regexp, $reason);
+ last if $line == MAX_RULES_PER_C_FILE;
+ }
+
+ print $re <<EOT;
+ [\\001-\\377] { goto start; }
+ [\\000] {return NULL; }
+*/
+
+}
+EOT
+ }
+
+ for (1..$numscans) {
+ # print "[re2c for block $_ / $numscans]\n";
+
+ my $cmd = "re2c -i -b -o scanner$_.c scanner$_.re";
+ run($cmd);
+
+ # this must be fatal; it can result in corrupt output modules missing
+ # scannerN() functions
+ if ($? >> 8 != 0) {
+ my $cwd = `pwd`; chop $cwd;
+ die "'$cmd' failed, dying!\n".
+ "see $cwd/scanner$_.re\n";
+ }
+ }
+
+ open(FILE, ">Makefile.PL") || die "write Makefile.PL: $!";
+ print FILE <<"EOT";
+ use ExtUtils::MakeMaker;
+
+ WriteMakefile(
+ 'NAME' => '$modname',
+ 'VERSION_FROM' => '$PMFILE',
+ 'ABSTRACT_FROM' => '$PMFILE',
+ 'OBJECT' => '\$(O_FILES)',
+ 'OPTIMIZE' => '-O2',
+ 'AUTHOR' => 'A. U. Tomated <au...@example.com>',
+ );
+EOT
+
+ open(FILE, ">MANIFEST.SKIP") || die "write MANIFEST.SKIP: $!";
+ print FILE <<'EOT';
+CVS/.*
+\.bak$
+\.sw[a-z]$
+\.tar$
+\.tgz$
+\.tar\.gz$
+\.o$
+\.xsi$
+\.bs$
+^.#
+^tmp/
+^blib/
+^Makefile$
+^Makefile\.[a-z]+$
+^pm_to_blib$
+~$
+EOT
+
+ open(my $re, ">$XSFILE") || die "write $XSFILE: $!";
+ print $re <<"EOT";
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+ /* split single-space-separated result string */
+ static void
+ split_and_add (AV *results, char *match)
+ {
+ char *wordstart, *cp;
+
+ for (cp = wordstart = match; *cp != (unsigned char) 0; cp++) {
+ if (*cp == ' ') {
+ av_push(results,
+ newSVpvn_share(wordstart, cp-wordstart, (U32)0));
+ wordstart = cp + 1;
+ }
+ }
+ av_push(results,
+ newSVpvn_share(wordstart, cp-wordstart, (U32)0));
+ }
+
+MODULE = $modname PACKAGE = $modname
+
+PROTOTYPES: DISABLE
+
+SV *
+scan(psv)
+ SV* psv
+
+ PREINIT:
+ int i;
+ char *match;
+ unsigned char *cursor;
+ unsigned char *pstart;
+ unsigned char *pend;
+ STRLEN plen;
+ AV *results;
+
+ CODE:
+ pstart = (unsigned char *) SvPVutf8(psv, plen);
+ pend = pstart + plen;
+ results = (AV *) sv_2mortal((SV *) newAV());
+EOT
+
+ for (1..$numscans) {
+ my $funcname = $cprefix."_scan".$_;
+
+ print $re <<EOT;
+ extern char *${funcname} (unsigned char **);
+
+ cursor = pstart;
+ while (cursor < pend) {
+ while (match = ${funcname} (\&cursor)) {
+ split_and_add(results, match);
+ }
+ }
+EOT
+
+ }
+
+ print $re <<EOT;
+ RETVAL = newRV((SV *) results);
+ OUTPUT:
+ RETVAL
+
+EOT
+
+ close($re);
+
+ open(FILE, ">$PMFILE") || die "write $PMFILE: $!";
+ print FILE <<"EOT";
+
+package $modname;
+
+use strict;
+use vars qw(\$VERSION \@ISA \@EXPORT_OK);
+
+use DynaLoader ();
+
+BEGIN {
+\$VERSION = '1.0';
+\@ISA = qw(DynaLoader);
+\@EXPORT_OK = qw();
+
+our \$HAS_RULES = {
+ $has_rules
+};
+
+bootstrap $modname \$VERSION;
+}
+
+1;
+__END__
+
+=head1 NAME
+
+$modname - Efficient string matching for regexps found in $FILE
+
+=head1 SYNOPSIS
+
+ use $modname;
+
+ ...
+ my \$match = ${modname}::scan(\$string);
+
+=head1 DESCRIPTION
+
+This module was created by SpamAssassin with the aid of re2xs, which uses re2c
+to create an XS library capable of scanning through a bunch of regular
+expressions as defined in F<$FILE>.
+
+See C<sa-compile> for more details.
+
+=cut
+EOT
+
+}
+
+sub fixup_re {
+ my $re = shift;
+
+ # print "FIXUP: $re\n";
+
+ my $output = "";
+ my $TOK = qr([\{\^\$\"\(\)\[\|\\\.\+\*\?]);
+
+ my $STATE;
+ while ($re =~ /\G(.*?)($TOK)/gc) {
+ my $pre = $1;
+ my $tok = $2;
+
+ if (length($pre)) {
+ $output .= "\"$pre\"";
+ }
+
+ if ($tok eq "(") {
+ # Grouping
+ my $origpos = pos $re;
+ if ($re =~ /\G(.*)\)/gc) {
+ # trim that down, if necessary, to just the matching
+ # (...) substring.
+ my $subre = find_balanced_group($1);
+ $output .= "( " . fixup_re($subre) . " )";
+ # reset m//g offset to the end of that group, so that
+ # the next token is taken from the right place
+ pos $re = $origpos + length($subre) + 1;
+ }
+ else {
+ die "re: $re doesn't have group closing bracket";
+ }
+ }
+ elsif ($tok eq "|") {
+ $output .= " | ";
+ }
+ elsif ($tok eq ")") {
+ $output .= " ) ";
+ }
+ elsif ($tok eq "[") {
+ # chars
+ if ($re =~ /\G(.*?)\]/gc) {
+ $output .= "[$1]";
+ }
+ else {
+ die "re: $re doesn't have character class closing bracket";
+ }
+ }
+ elsif ($tok eq '{') {
+ if ($re =~ /\G(.*?)\}/gc) {
+ $output .= "{$1}";
+ }
+ else {
+ die "re: $re doesn't have quantifier closing bracket";
+ }
+ }
+ elsif ($tok eq '.') {
+ $output .= '.';
+ }
+ elsif ($tok eq '"') {
+ $output .= '"\\'; # reversed later to form \"
+ }
+ elsif ($tok eq '*') {
+ # /guaranteed*/ => "guarantee" "d"* , not "guaranteed"*
+ $output =~ s/(.)\"\s*$/\" \"$1\"/;
+ $output .= '* ';
+ }
+ elsif ($tok eq '?') {
+ # /guaranteed?/ => "guarantee" "d"? , not "guaranteed"?
+ $output =~ s/(.)\"\s*$/\" \"$1\"/;
+ $output .= '? ';
+ }
+ elsif ($tok eq '+') {
+ # /guaranteed+/ => "guarantee" "d"+ , not "guaranteed"+
+ $output =~ s/(.)\"\s*$/\" \"$1\"/;
+ $output .= '+ ';
+ }
+ elsif ($tok eq "^") {
+ die "Unsupported anchor: \\$tok";
+ }
+ elsif ($tok eq "\$") {
+ die "Unsupported anchor: \\$tok";
+ }
+ elsif ($tok eq '\\') {
+ $re =~ /\G(.)/gc or die "\\ at end of string!";
+ my $esc = $1;
+ if ($esc !~ /^
+ [\.\@\$\(\)\/\-\+\*\^\?\!_]
+ $/x)
+ {
+ die "Unsupported escape: \\$esc";
+ }
+ $output .= "\"$esc\"";
+ }
+ else {
+ print "PRE: $pre\nTOK: $tok\n";
+ }
+ }
+
+ if (!defined(pos($re))) {
+ # no matches
+ $output .= "\"$re\"";
+ }
+ elsif (pos($re) <= length($re)) {
+ $output .= fixup_re(substr($re, pos($re)));
+ }
+
+ $output =~ s/""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
+ # print "OUTPUT: $output\n";
+ return $output;
+}
+
+sub handle_fixup_error {
+ my ($strat, $regexp, $reason) = @_;
+ if ($strat) {
+ warn "skipped: $regexp: $strat";
+ }
+}
+
+sub find_balanced_group {
+ my ($re) = @_;
+
+ # $re could be:
+ #
+ # foo
+ # foo|bar|baz
+ # foo(aa|bb|cc)|bar|baz
+ # foo(aa|(bb|cc|ddd))|bar(e|f)
+ #
+ # or, due to inefficient regexp parsing above:
+ #
+ # foo) totally_unrelated_text (next_grouping => "foo"
+ #
+ # all we have to do here is ensure that we correctly pick out
+ # the balanced group.
+
+ my $output = "";
+ my $TOK = qr([\(\\\)]);
+ my $level = 0;
+
+ my $STATE;
+ while ($re =~ /\G(.*?)($TOK)/gc) {
+ my $pre = $1;
+ my $tok = $2;
+
+ if (length($pre)) {
+ $output .= $pre;
+ }
+
+ if ($tok eq "(") {
+ $output .= $tok;
+ $level++;
+ }
+ elsif ($tok eq ")") {
+ $level--;
+ if ($level < 0) {
+ # we found the matching close-bracket; stop searching here
+ # warn "JMD partial $re => $output";
+ return $output;
+ }
+ $output .= $tok;
+ }
+ elsif ($tok eq '\\') {
+ $re =~ /\G(.)/gc or die "\\ at end of string!";
+ my $esc = $1;
+ $output .= $tok.$esc;
+ }
+ else {
+ $output .= $tok;
+ }
+ }
+
+ if (!defined(pos($re))) {
+ $output .= $re;
+ }
+ elsif (pos($re) <= length($re)) {
+ $output .= substr($re, pos($re));
+ }
+
+ # we hit end-of-string; in other words, it was a balanced re anyway
+ # warn "JMD fully balanced $re";
+ return $re;
+}
+
+##############################################################################
+
+=cut
+
+=head1 NAME
+
+sa-compile - compile SpamAssassin ruleset into native code
+
+=head1 SYNOPSIS
+
+B<sa-compile> [options]
+
+Options:
+
+ --list Output base string list to STDOUT
+
+ -C path, --configpath=path, --config-file=path Path to standard configuration dir
+ -p prefs, --prefspath=file, --prefs-file=file Set user preferences file
+ --siteconfigpath=path Path for site configs (def: /etc/mail/spamassassin)
+ --cf='config line' Additional line of configuration
+
+ -D, --debug [area=n,...] Print debugging messages
+ -V, --version Print version
+ -h, --help Print usage message
+
+=head1 DESCRIPTION
+
+sa-compile uses C<re2c> to compile the SpamAssassin ruleset into C code in a
+Perl XS module, and from there into native object code. This will then be used
+by the C<Mail::SpamAssassin::Plugin::Rule2XSBody> plugin to speed up
+SpamAssassin's operation, where possible.
+
+This requires C<re2c> (see C<http://re2c.org/>), and the C compiler used to
+build Perl XS modules, be installed.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<--list>
+
+Output the extracted base strings to STDOUT, instead of generating
+the C extension code.
+
+=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
+
+Use the specified path for locating the distributed configuration files.
+Ignore the default directories (usually C</usr/share/spamassassin> or similar).
+
+=item B<--siteconfigpath>=I<path>
+
+Use the specified path for locating site-specific configuration files. Ignore
+the default directories (usually C</etc/mail/spamassassin> or similar).
+
+=item B<--cf='config line'>
+
+Add additional lines of configuration directly from the command-line, parsed
+after the configuration files are read. Multiple B<--cf> arguments can be
+used, and each will be considered a separate line of configuration.
+
+=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
+
+Read user score preferences from I<prefs> (usually
+C<$HOME/.spamassassin/user_prefs>) .
+
+=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
+
+Produce debugging output. If no areas are listed, all debugging information is
+printed. Diagnostic output can also be enabled for each area individually;
+I<area> is the area of the code to instrument.
+
+For more information about which areas (also known as channels) are available,
+please see the documentation at:
+
+ C<http://wiki.apache.org/spamassassin/DebugChannels>
+
+=item B<-h>, B<--help>
+
+Print help message and exit.
+
+=item B<-V>, B<--version>
+
+Print sa-compile version and exit.
+
+=back
+
+=head1 SEE ALSO
+
+Mail::SpamAssassin(3)
+spamassassin(1)
+spamd(1)
+
+=head1 PREREQUESITES
+
+C<Mail::SpamAssassin>
+C<re2c>
+
+=head1 BUGS
+
+See <http://issues.apache.org/SpamAssassin/>
+
+=head1 AUTHORS
+
+The Apache SpamAssassin(tm) Project <http://spamassassin.apache.org/>
+
+=head1 COPYRIGHT
+
+SpamAssassin is distributed under the Apache License, Version 2.0, as
+described in the file C<LICENSE> included with the distribution.
+
+=cut
+
Propchange: spamassassin/branches/jm_re2c_hacks/sa-compile.raw
------------------------------------------------------------------------------
svn:executable = *
Added: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t?view=auto&rev=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t (added)
+++ spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t Mon Nov 6 15:01:01 2006
@@ -0,0 +1,134 @@
+#!/usr/bin/perl
+
+# Test regular expression base-string extraction in
+# Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+
+use lib '.'; use lib 't';
+use SATest; sa_t_init("re_base_extraction");
+use Test;
+use strict;
+use warnings;
+
+BEGIN {
+ if (-e 't/test_dir') { chdir 't'; }
+ if (-e 'test_dir') { unshift(@INC, '../blib/lib'); }
+
+ plan tests => 20;
+
+};
+
+try_extraction ('
+ body FOO /foo bar/
+ body EXCUSE_REMOVE /to be removed from.{0,20}(?:mailings|offers)/i
+ body KAM_STOCKTIP15 /(?:Nano Superlattice Technology|NSLT)/is
+ body TEST1 /foo(?:ish)? bar/
+ body TEST2 /foody* bar/
+ body TEST3 /foody? bar/
+ body TEST4 /A(?i:ct) N(?i:ow)/
+ body TEST5 /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
+
+', {
+ base_extract => 1,
+ bases_must_be_casei => 1,
+ bases_can_use_alternations => 0,
+ bases_can_use_quantifiers => 0,
+ bases_can_use_char_classes => 0,
+ bases_split_out_alternations => 1
+}, [
+
+ 'foo bar:TEST1 FOO',
+ 'to be removed from:EXCUSE_REMOVE',
+ 'nslt:KAM_STOCKTIP15',
+ 'nano superlattice technology:KAM_STOCKTIP15',
+ 'fooish bar:TEST1',
+ 'act now:TEST4',
+ 'food:TEST2',
+ 'food bar:TEST3 TEST2',
+ 'foody bar:TEST3 TEST2',
+ 'refinanc:TEST5',
+ 'time to refinance:TEST5',
+
+
+]);
+
+try_extraction ('
+ body FOO /foo bar/
+ body EXCUSE_REMOVE /to be removed from.{0,20}(?:mailings|offers)/i
+ body KAM_STOCKTIP15 /(?:Nano Superlattice Technology|NSLT)/is
+ body TEST1 /foo(?:ish)? bar/
+
+', {
+ base_extract => 1,
+ bases_must_be_casei => 1,
+ bases_can_use_alternations => 0,
+ bases_can_use_quantifiers => 0,
+ bases_can_use_char_classes => 0,
+ bases_split_out_alternations => 0
+}, [
+
+ 'foo bar:FOO',
+ 'to be removed from:EXCUSE_REMOVE',
+],[
+
+ 'foo bar:FOO TEST1',
+ 'nano superlattice technology:KAM_STOCKTIP15',
+ 'fooish bar:TEST1'
+
+]);
+###########################################################################
+
+use Mail::SpamAssassin;
+
+sub try_extraction {
+ my ($rules, $params, $output, $notoutput) = @_;
+
+ my $sa = Mail::SpamAssassin->new({
+ rules_filename => "log/test_rules_copy",
+ site_rules_filename => "log/test_default.cf",
+ userprefs_filename => "log/userprefs.cf",
+ local_tests_only => 1,
+ debug => 0,
+ dont_copy_prefs => 1,
+ });
+ ok($sa);
+
+ # remove all rules and plugins; we want just our stuff
+ unlink(<log/test_rules_copy/*.pm>);
+ unlink(<log/test_rules_copy/*.cf>);
+
+ open (OUT, ">log/test_rules_copy/00_test.cf") or die "failed to write rule";
+ print OUT "
+ loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+ ".$rules;
+ close OUT;
+
+ my ($k, $v);
+ while (($k, $v) = each %{$params}) { $sa->{$k}=$v; }
+
+ $sa->init();
+ ok ($sa->lint_rules() == 0) or warn "lint failed: $rules";
+
+ my $conf = $sa->{conf};
+ my $ruletype = "body_0";
+ foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
+ print "INPUT: $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
+ }
+ my %found = ();
+ foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
+ my $str = "$key:$conf->{base_string}->{$ruletype}->{$key}";
+ print "BASES: '$str'\n";
+ $found{$str} = 1;
+ }
+
+ # $output ||= [];
+ foreach my $line (@{$output}) {
+ ok($found{$line}) or warn "failed to find '$line'";
+ }
+
+ $notoutput ||= [];
+ foreach my $line (@{$notoutput}) {
+ ok(!$found{$line}) or warn "found '$line' but didn't want to";
+ }
+}
+
+
Propchange: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
------------------------------------------------------------------------------
svn:executable = *