You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/11/07 00:01:02 UTC

svn commit: r471922 - in /spamassassin/branches/jm_re2c_hacks: ./ lib/Mail/ lib/Mail/SpamAssassin/Plugin/ rule2xs/ rule2xs/RabinKarpAccel-0.01/ t/

Author: jm
Date: Mon Nov  6 15:01:01 2006
New Revision: 471922

URL: http://svn.apache.org/viewvc?view=rev&rev=471922
Log:
add test cases for base-string extraction; add sa-compile script, used to compile the ruleset; use /var/lib/spamassassin/compiled/VERSION as the location for compiled rulesets

Added:
    spamassassin/branches/jm_re2c_hacks/sa-compile.raw   (with props)
    spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t   (with props)
Removed:
    spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs
    spamassassin/branches/jm_re2c_hacks/run
Modified:
    spamassassin/branches/jm_re2c_hacks/MANIFEST
    spamassassin/branches/jm_re2c_hacks/Makefile.PL
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL

Modified: spamassassin/branches/jm_re2c_hacks/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/MANIFEST?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/MANIFEST (original)
+++ spamassassin/branches/jm_re2c_hacks/MANIFEST Mon Nov  6 15:01:01 2006
@@ -542,3 +542,4 @@
 lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
 t/spamd_kill_restart.t
 t/spamd_kill_restart_rr.t
+t/re_base_extraction.t

Modified: spamassassin/branches/jm_re2c_hacks/Makefile.PL
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/Makefile.PL?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/Makefile.PL (original)
+++ spamassassin/branches/jm_re2c_hacks/Makefile.PL Mon Nov  6 15:01:01 2006
@@ -167,6 +167,7 @@
       'spamassassin.raw' => 'spamassassin',
       'sa-learn.raw'     => 'sa-learn',
       'sa-update.raw'    => 'sa-update',
+      'sa-compile.raw'    => 'sa-compile',
       'spamc/spamc.c'    => 'spamc/spamc$(EXE_EXT)',
       'spamd/spamd.raw'  => 'spamd/spamd',
     },
@@ -180,6 +181,7 @@
         'lib/spamassassin-run.pod' => '$(INST_MAN1DIR)/spamassassin-run.$(MAN1EXT)',
         'sa-learn'         => '$(INST_MAN1DIR)/sa-learn.$(MAN1EXT)',
         'sa-update'        => '$(INST_MAN1DIR)/sa-update.$(MAN1EXT)',
+        'sa-compile'       => '$(INST_MAN1DIR)/sa-compile.$(MAN1EXT)',
         'spamc/spamc.pod'  => '$(INST_MAN1DIR)/spamc.$(MAN1EXT)',
         'spamd/spamd'      => '$(INST_MAN1DIR)/spamd.$(MAN1EXT)',
     },
@@ -223,7 +225,7 @@
     },
 
     'clean' => { FILES => join(' ' =>
-        'sa-learn', 'sa-update', 'spamassassin',
+        'sa-learn', 'sa-update', 'spamassassin', 'sa-compile',
         
         'spamd/spamd',
 
@@ -1116,6 +1118,9 @@
 
 sa-update: sa-update.raw build_rules
 	$(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -isa-update.raw -osa-update
+
+sa-compile: sa-compile.raw
+	$(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -isa-compile.raw -osa-compile
 
 spamd/spamd: spamd/spamd.raw
 	$(PREPROCESS) $(FIXBYTES) $(FIXVARS) $(FIXBANG) -m$(PERM_RWX) -i$? -o$@

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm Mon Nov  6 15:01:01 2006
@@ -289,7 +289,8 @@
 =item LOCAL_STATE_DIR
 
 Location of the local state directory, mainly used for installing updates via
-C<sa-update>.  Defaults to "@@LOCAL_STATE_DIR@@".
+C<sa-update> and compiling rulesets to native code.  Defaults to
+"@@LOCAL_STATE_DIR@@".
 
 =back
 

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Mon Nov  6 15:01:01 2006
@@ -20,8 +20,8 @@
 
 =head1 SYNOPSIS
 
-This is a work-in-progress plugin to extract "base" strings from SpamAssassin
-'body' rules, suitable for use in Rule2XSBody rules.
+This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
+suitable for use in Rule2XSBody rules or other parallel matching algorithms.
 
 =cut
 
@@ -37,14 +37,17 @@
 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);
 
-# a few settings that control what kind of bases are output:
+# a few settings that control what kind of bases are output.
 
 # treat all rules as lowercase for purposes of term extraction?
-my $BASES_MUST_BE_CASE_I = 1;
-my $BASES_CAN_USE_ALTERNATIONS = 0;    # /(foo|bar|baz)/
-my $BASES_CAN_USE_QUANTIFIERS = 0;     # /foo.*bar/ or /foo*bar/ or /foooo?bar/
-my $BASES_CAN_USE_CHAR_CLASSES = 0;    # /fo[opqr]bar/
-my $SPLIT_OUT_ALTERNATIONS = 1;        # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
+# $main->{bases_must_be_casei} = 1;
+# $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
+# $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
+# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
+# $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
+
+# TODO: it would be nice to have a clean API to pass such settings
+# through to plugins instead of hanging them off $main
 
 ###########################################################################
 
@@ -70,32 +73,20 @@
 sub extract_bases {
   my ($self, $conf) = @_;
 
-  # TODO: need a better way to do this rather than using an env
-  # var as a back channel
-  my $rawf = $ENV{'RULE_REGEXP_DUMP_FILE'};
-  my $f;
-
-  if ($rawf) {
-    $rawf =~ /^(.*)$/;
-    $f = $1;        # untaint; allow anything here, it's from %ENV and safe
-  }
-  else {
+  my $main = $conf->{main};
+  if (!$main->{base_extract}) {
     return;         # TODO: comment this for Rabin-Karp
   }
 
-  $self->extract_set($f, $conf, $conf->{body_tests}, 'body');
+  $self->extract_set($conf, $conf->{body_tests}, 'body');
 }
 
 sub extract_set {
-  my ($self, $dumpfile, $conf, $test_set, $ruletype) = @_;
+  my ($self, $conf, $test_set, $ruletype) = @_;
 
   foreach my $pri (keys %{$test_set}) {
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;
     $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
-
-    if ($dumpfile) {
-      $self->dump_base_strings($dumpfile, $conf, $ruletype.'_'.$nicepri);
-    }
   }
 }
 
@@ -109,6 +100,8 @@
   my $yes = 0;
   my $no = 0;
 
+  $self->{main} = $conf->{main};	# for use in extract_hints()
+
   dbg("zoom: base extraction start for type $ruletype");
 
   # attempt to find good "base strings" (simplified regexp subsets) for each
@@ -254,27 +247,7 @@
     $conf->{base_string}->{$ruletype}->{$base} = $key;
   }
 
-  warn ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
-}
-
-###########################################################################
-
-sub dump_base_strings {
-  my ($self, $dumpfile, $conf, $ruletype) = @_;
-
-  open (OUT, ">$dumpfile") or die "cannot write to $dumpfile!";
-  print OUT "name $ruletype\n";
-
-  foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
-    print OUT "orig $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
-  }
-
-  foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
-    print OUT "r $key:$conf->{base_string}->{$ruletype}->{$key}\n";
-  }
-  close OUT or die "close failed on $dumpfile!";
-
-  warn ("zoom: bases written to '$dumpfile'\n");
+  info ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
 }
 
 ###########################################################################
@@ -290,6 +263,7 @@
   my $rule = shift;
   my $is_reversed = shift;
 
+  my $main = $self->{main};
   my $orig = $rule;
   $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
 
@@ -310,7 +284,7 @@
   # anyway.  Simplification that causes the regexp to *not* hit
   # stuff that the "real" rule would hit, however, is a bad thing.
 
-  if ($BASES_MUST_BE_CASE_I) {
+  if ($main->{bases_must_be_casei}) {
     $rule = lc $rule;
     $mods =~ s/i//;
 
@@ -355,7 +329,8 @@
   $rule =~ s/\(\?:/\(/g;
 
   # this must be before reversing
-  if ($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS) {
+  if ($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations})
+  {
     # /foo (bar)? baz/ simplify to /foo (bar|) baz/
     $rule =~ s/(?<!\\)(\([^\(\)]*)\)\?/$1\|\)/gs;
 
@@ -380,21 +355,21 @@
               \\[ABCE-RT-VX-Z]
             ).*$//gsx;
 
-  $BASES_CAN_USE_CHAR_CLASSES or $rule =~ s/(?<!\\)(?:
+  $main->{bases_can_use_char_classes} or $rule =~ s/(?<!\\)(?:
               \\\w|
               \.|
               \[|
               \]
             ).*$//gsx;
 
-  $BASES_CAN_USE_QUANTIFIERS or $rule =~ s/(?<!\\)(?:
+  $main->{bases_can_use_quantifiers} or $rule =~ s/(?<!\\)(?:
               .\*|	# remove the quantified char, too
               .\+|
               .\?|
               .\{
             ).*$//gsx;
 
-  ($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS) or
+  ($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations}) or
             $rule =~ s/(?<!\\)(?:
               \(|
               \)
@@ -414,7 +389,7 @@
   # simplify (..)? and (..|) to (..|z{0})
   # this wierd construct is to work around an re2c bug; (..|) doesn't
   # do what it should
-  if ($BASES_CAN_USE_ALTERNATIONS) {
+  if ($main->{bases_can_use_alternations}) {
     $rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
     $rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
     $rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
@@ -488,7 +463,7 @@
   }
 
   # return for things we know we can't handle.
-  if (!($BASES_CAN_USE_ALTERNATIONS||$SPLIT_OUT_ALTERNATIONS)) {
+  if (!($main->{bases_can_use_alternations}||$main->{bases_split_out_alternations})) {
     if ($rule =~ /\|/) {
       # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
       die "alternations";
@@ -529,7 +504,7 @@
   }
 
   my @rules;
-  if ($SPLIT_OUT_ALTERNATIONS && $rule =~ /\|/) {
+  if ($main->{bases_split_out_alternations} && $rule =~ /\|/) {
     @rules = $self->split_alt($rule);
   }
   else {

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm Mon Nov  6 15:01:01 2006
@@ -14,6 +14,24 @@
 # limitations under the License.
 # </...@LICENSE>
 
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::Rule2XSBody - speed up SpamAssassin by compiling regexps
+
+=head1 SYNOPSIS
+
+  loadplugin     Mail::SpamAssassin::Plugin::Rule2XSBody
+
+=head1 DESCRIPTION
+
+This plugin will use native-code object files representing the ruleset,
+in order to provide significant speedups in rule evaluation.
+
+Note that C<sa-compile> must be run in advance, in order to compile the
+ruleset using C<re2c> and the C compiler.
+
+=cut
+
 package Mail::SpamAssassin::Plugin::Rule2XSBody;
 
 use Mail::SpamAssassin::Plugin;
@@ -41,6 +59,12 @@
 sub finish_parsing_end {
   my ($self, $params) = @_;
   my $conf = $params->{conf};
+
+  my $instdir = $conf->{main}->sed_path
+			('__local_state_dir__/compiled/__version__');
+  push @INC, $instdir, "$instdir/auto";
+  dbg "zoom: loading compiled ruleset from $instdir";
+
   $self->setup_test_set ($conf, $conf->{body_tests}, 'body');
 }
 
@@ -97,9 +121,10 @@
 
     my $totalhasrules = scalar keys %{$hasrules};
     my $pc_zoomed   = ($found / ($totalhasrules || .001)) * 100;
+    $pc_zoomed   = int($pc_zoomed * 1000) / 1000;
 
-    dbg("zoom: $found compiled rules are available for type $ruletype; ".
-        "$pc_zoomed\% were usable");
+    dbg("zoom: $found compiled rules are available for type $ruletype out ".
+        "of $totalhasrules ($pc_zoomed\%)");
 
     $conf->{zoom_ruletypes_available} ||= { };
     $conf->{zoom_ruletypes_available}->{$ruletype} = 1;

Modified: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL?view=diff&rev=471922&r1=471921&r2=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL (original)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL Mon Nov  6 15:01:01 2006
@@ -8,7 +8,7 @@
     PREREQ_PM         => {}, # e.g., Module::Name => 1.1
     ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
       (ABSTRACT_FROM  => 'lib/RabinKarpAccel.pm', # retrieve abstract from module
-       AUTHOR         => 'A. U. Thor <jm@>') : ()),
+       AUTHOR         => 'Justin Mason <jm...@jmason.org>') : ()),
     LIBS              => [''], # e.g., '-lm'
     DEFINE            => '', # e.g., '-DHAVE_SOMETHING'
     INC               => '-I.', # e.g., '-I. -I/usr/include/other'

Added: spamassassin/branches/jm_re2c_hacks/sa-compile.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/sa-compile.raw?view=auto&rev=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/sa-compile.raw (added)
+++ spamassassin/branches/jm_re2c_hacks/sa-compile.raw Mon Nov  6 15:01:01 2006
@@ -0,0 +1,859 @@
+#!/usr/bin/perl -w -T
+
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+my $PREFIX          = '@@PREFIX@@';             # substituted at 'make' time
+my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';      # substituted at 'make' time
+my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';    # substituted at 'make' time
+my $LOCAL_STATE_DIR = '@@LOCAL_STATE_DIR@@';    # substituted at 'make' time
+use lib '@@INSTALLSITELIB@@';                   # substituted at 'make' time
+
+use File::Spec;
+
+BEGIN {
+  my @bin = File::Spec->splitpath($0);
+  my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
+            || File::Spec->curdir;
+
+  if (-e $bin.'/lib/Mail/SpamAssassin.pm'
+        || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm')
+  {
+    if ( $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm' ) {
+      unshift(@INC, '../blib/lib');
+    }
+    else {
+      foreach (qw(lib ../lib/site_perl
+                  ../lib/spamassassin ../share/spamassassin/lib))
+      {
+        my $dir = File::Spec->catdir($bin, split('/', $_));
+        if(-f File::Spec->catfile($dir, "Mail", "SpamAssassin.pm")) {
+          unshift(@INC, $dir); last;
+        }
+      }
+    }
+  }
+}
+
+use strict;
+use warnings;
+
+use Mail::SpamAssassin;
+use Getopt::Long;
+use File::Copy;
+use File::Path;
+
+use vars qw( %opt );
+Mail::SpamAssassin::Util::clean_path_in_taint_mode();
+Mail::SpamAssassin::Util::untaint_var( \%ENV );
+
+##############################################################################
+
+Getopt::Long::Configure(
+  qw(bundling no_getopt_compat
+    permute no_auto_abbrev no_ignore_case)
+);
+
+GetOptions(
+  'list'		=> \$opt{'list'},
+
+  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
+  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
+  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
+  'cf=s'                                    => \@{$opt{'cf'}},
+  'debug-level|D:s' => \$opt{'debug'},
+  'help|h|?'        => \$opt{'help'},
+  'version|V'       => \$opt{'version'},
+  )
+  or usage( 0, "Unknown option!" );
+
+if ( defined $opt{'help'} ) {
+  usage( 0, "For more information read the manual page" );
+}
+if ( defined $opt{'version'} ) {
+  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
+  exit 0;
+}
+
+# set debug areas, if any specified (only useful for command-line tools)
+if (defined $opt{'debug'}) {
+  $opt{'debug'} ||= 'all';
+}
+
+# ensure the body-rule base extractor plugin is loaded, we use that
+my $post_config = q(
+  loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+).join("\n", @{$opt{'cf'}})."\n";
+
+my $spamtest = new Mail::SpamAssassin(
+  {
+    rules_filename      => $opt{'configpath'},
+    site_rules_filename => $opt{'siteconfigpath'},
+    userprefs_filename  => $opt{'prefspath'},
+    debug               => $opt{'debug'},
+    local_tests_only    => 1,
+    dont_copy_prefs     => 1,
+    PREFIX              => $PREFIX,
+    DEF_RULES_DIR       => $DEF_RULES_DIR,
+    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
+    LOCAL_STATE_DIR     => $LOCAL_STATE_DIR,
+    post_config_text    => $post_config,
+  }
+);
+
+# appropriate settings for rule2xs usage
+$spamtest->{base_extract} = 1;
+$spamtest->{bases_must_be_casei} = 1;
+$spamtest->{bases_can_use_alternations} = 0;
+$spamtest->{bases_can_use_quantifiers} = 0;
+$spamtest->{bases_can_use_char_classes} = 0;
+$spamtest->{bases_split_out_alternations} = 0;
+
+$spamtest->init(1);
+
+# this actually extracts the base rules in the plugin, as a side-effect
+my $res = $spamtest->lint_rules();
+
+if ($res) {
+  die "not compiling; 'spamassassin --lint' check failed!\n";
+}
+
+if ( defined $opt{'list'} ) {
+  print dump_base_strings();
+}
+else {
+  compile_base_strings();
+}
+
+$spamtest->finish();
+exit;
+
+##############################################################################
+
+sub dump_base_strings {
+  my $conf = $spamtest->{conf};
+
+  my $s = '';
+  foreach my $ruletype (sort keys %{$conf->{base_orig}}) {
+    $s .= "name $ruletype\n";
+
+    foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
+      $s .= "orig $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
+    }
+
+    foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
+      $s .= "r $key:$conf->{base_string}->{$ruletype}->{$key}\n";
+    }
+  }
+  return $s;
+}
+
+##############################################################################
+
+sub compile_base_strings {
+  my $dirpath = Mail::SpamAssassin::Util::secure_tmpdir();
+  die "secure_tmpdir failed" unless $dirpath && -w $dirpath;
+
+  open OUT, ">$dirpath/bases.in"
+	or die "cannot write to $dirpath/bases.in";
+  print OUT dump_base_strings();
+  close OUT or die "cannot write to $dirpath/bases.in";
+
+  chdir $dirpath; print "cd $dirpath\n";
+  rule2xs("bases.in");
+
+  my $instdir = $spamtest->sed_path('__local_state_dir__/compiled/__version__');
+
+  run(get_perl()." Makefile.PL ".
+	"PREFIX=$dirpath/ignored INSTALLSITEARCH=$instdir");
+
+  run("make install");	# into $instdir
+}
+
+sub run {
+  my @cmd = @_;
+  print join(' ',@cmd)."\n";
+  system(@cmd);
+  ($?>>8 != 0) and die "command failed!";
+}
+
+sub get_perl {
+  my $perl;
+  if ($^X =~ m|^/|) {
+    $perl = $^X;
+  } else {
+    use Config;
+    $perl = $Config{perlpath};
+    $perl =~ s|/[^/]*$|/$^X|;
+  }
+  $perl =~ /^(.*)$/;
+  return $1;
+}
+
+##############################################################################
+
+use constant MAX_RULES_PER_C_FILE => 200;
+
+sub rule2xs {
+  my $modname;
+  my $force = 1;
+  my $FILE = shift;
+
+  open(my $fh, "sort $FILE |") || die "open($FILE): $!";
+# read ruleset name from the first line in the file
+  my $ruleset_name;
+  $_ = <$fh>;
+  if (/^name\s+(\S+)/) {
+    $ruleset_name = $1;
+  }
+
+  if (!$modname) {
+    $modname = "Mail::SpamAssassin::CompiledRegexps::$ruleset_name";
+  }
+
+  our $PATH = $modname;
+  $PATH =~ s/::/-/g;
+  our $PMFILE = $modname;
+  $PMFILE =~ s/.*:://;
+  $PMFILE .= ".pm";
+  our $XSFILE = $PMFILE;
+  $XSFILE =~ s/\.pm$/.xs/;
+
+  $force and rmtree $PATH;
+  mkdir $PATH or (!$force and die "mkdir($PATH): $!");
+  chdir $PATH; print "cd $PATH\n";
+
+  my $cprefix = $modname; $cprefix =~ s/[^A-ZA-z0-9]+/_/gs;
+
+  my $numscans = 0;
+  my (@dot_star, @dot_plus);
+
+  my $has_rules = '';
+
+  while (!eof($fh)) {
+    $numscans++;
+
+    open(my $re, ">scanner${numscans}.re") || die "open(>scanner{$numscans}.re): $!";
+
+    print $re <<EOT;
+#define NULL            ((char*) 0)
+#define YYCTYPE         unsigned char
+#define YYCURSOR        *p
+#define YYLIMIT         *p
+#define YYMARKER        q
+#define YYFILL(n)
+EOT
+
+    print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+/*!re2c
+EOT
+
+    my $line = 0;
+    while (<$fh>) {
+      next if /^#/;
+
+      if (/^orig\s+(\S+)\s+(.*)$/) {
+	my $name = $1;
+	my $regexp = $2;
+	$name =~ s/#/[hash]/gs;
+	$regexp =~ s/#/[hash]/gs;
+	$has_rules .= "  q#$name# => q#$regexp#,\n";
+	next;
+      }
+
+      my ($regexp, $reason) = /^r (.*):(.*)$/;
+      die "no 'r REGEXP:REASON' in $_" unless defined $regexp;
+
+      if ($regexp =~ /^\.\*/) {
+	push @dot_star, "$regexp:$reason";
+	next;
+      }
+      elsif ($regexp =~ /^\.\+/) {
+	push @dot_plus, "$regexp:$reason";
+	next;
+      }
+      eval {
+	print $re "\t", fixup_re($regexp), "            {return \"$reason\";}\n";
+	$line++;
+      };
+      $@ and handle_fixup_error($@, $regexp, $reason);
+      last if $line == MAX_RULES_PER_C_FILE;
+    }
+
+    print $re <<EOT;
+  [\\000-\\377]        { return NULL; }
+*/
+}
+EOT
+
+    #last if $numscans == 2;
+  }
+
+  while (@dot_star) {
+    $numscans++;
+
+    open(my $re, ">scanner${numscans}.re") || 
+	die "open(>scanner{$numscans}.re): $!";
+
+    print $re <<EOT;
+#define NULL            ((char*) 0)
+#define YYCTYPE         unsigned char
+#define YYCURSOR        *p
+#define YYLIMIT         *p
+#define YYMARKER        q
+#define YYFILL(n)
+EOT
+
+    print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+start:
+/*!re2c
+EOT
+
+    my $line = 0;
+    while ($_ = shift @dot_star) {
+      my ($regexp, $reason) = /^(.*):(.*)$/;
+      $regexp =~ s/^\.\*//;
+      eval {
+	print $re "\t", fixup_re($regexp), "             {return \"$reason\";}\n";
+	$line++;
+      };
+      $@ and handle_fixup_error($@, $regexp, $reason);
+      last if $line == MAX_RULES_PER_C_FILE;
+    }
+    
+    print $re <<EOT;
+  [\\001-\\377]    { goto start; }
+  [\\000]        {return NULL; }
+*/
+}
+EOT
+  }
+
+  while (@dot_plus) {
+    $numscans++;
+
+    open(my $re, ">scanner${numscans}.re") || 
+	die "open(>scanner{$numscans}.re): $!";
+    
+    print $re <<EOT;
+#define NULL            ((char*) 0)
+#define YYCTYPE         unsigned char
+#define YYCURSOR        *p
+#define YYLIMIT         *p
+#define YYMARKER        q
+#define YYFILL(n)
+EOT
+
+    print $re <<EOT;
+char *${cprefix}_scan${numscans}(unsigned char **p){
+unsigned char *q;
+p++;
+start:
+/*!re2c
+EOT
+    
+    my $line = 0;
+    while ($_ = shift @dot_plus) {
+      my ($regexp, $reason) = /^(.*):(.*)$/;
+      $regexp =~ s/^\.\+//;
+
+      eval {
+	print $re "\t", fixup_re($regexp), "             {return \"$reason\";}\n";
+	$line++;
+      };
+      $@ and handle_fixup_error($@, $regexp, $reason);
+      last if $line == MAX_RULES_PER_C_FILE;
+    }
+    
+    print $re <<EOT;
+  [\\001-\\377]    { goto start; }
+  [\\000]        {return NULL; }
+*/
+
+}
+EOT
+  }
+
+  for (1..$numscans) {
+    # print "[re2c for block $_ / $numscans]\n";
+
+    my $cmd = "re2c -i -b -o scanner$_.c scanner$_.re";
+    run($cmd);
+
+    # this must be fatal; it can result in corrupt output modules missing
+    # scannerN() functions
+    if ($? >> 8 != 0) {
+      my $cwd = `pwd`; chop $cwd;
+      die "'$cmd' failed, dying!\n".
+	  "see $cwd/scanner$_.re\n";
+    }
+  }
+
+  open(FILE, ">Makefile.PL") || die "write Makefile.PL: $!";
+  print FILE <<"EOT";
+    use ExtUtils::MakeMaker;
+
+    WriteMakefile(
+	'NAME' => '$modname',
+	'VERSION_FROM' => '$PMFILE',
+	'ABSTRACT_FROM' => '$PMFILE',
+	'OBJECT' => '\$(O_FILES)',
+	'OPTIMIZE' => '-O2',
+	'AUTHOR' => 'A. U. Tomated <au...@example.com>',
+    );
+EOT
+
+  open(FILE, ">MANIFEST.SKIP") || die "write MANIFEST.SKIP: $!";
+  print FILE <<'EOT';
+CVS/.*
+\.bak$
+\.sw[a-z]$
+\.tar$
+\.tgz$
+\.tar\.gz$
+\.o$
+\.xsi$
+\.bs$
+^.#
+^tmp/
+^blib/
+^Makefile$
+^Makefile\.[a-z]+$
+^pm_to_blib$
+~$
+EOT
+
+  open(my $re, ">$XSFILE") || die "write $XSFILE: $!";
+  print $re <<"EOT";
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+  /* split single-space-separated result string */
+  static void
+  split_and_add (AV *results, char *match)
+  {
+      char *wordstart, *cp;
+
+      for (cp = wordstart = match; *cp != (unsigned char) 0; cp++) {
+	if (*cp == ' ') {
+	  av_push(results,
+	      newSVpvn_share(wordstart, cp-wordstart, (U32)0));
+	  wordstart = cp + 1;
+	}
+      }
+      av_push(results,
+	      newSVpvn_share(wordstart, cp-wordstart, (U32)0));
+  }
+
+MODULE = $modname  PACKAGE = $modname
+
+PROTOTYPES: DISABLE
+
+SV *
+scan(psv)
+	SV* psv
+
+  PREINIT:
+	int i;
+	char *match;
+	unsigned char *cursor;
+	unsigned char *pstart;
+	unsigned char *pend;
+	STRLEN plen;
+	AV *results;
+
+  CODE:
+	pstart = (unsigned char *) SvPVutf8(psv, plen);
+	pend = pstart + plen;
+	results = (AV *) sv_2mortal((SV *) newAV());
+EOT
+
+  for (1..$numscans) {
+    my $funcname = $cprefix."_scan".$_;
+
+    print $re <<EOT;
+	  extern char *${funcname} (unsigned char **);
+
+	  cursor = pstart;
+	  while (cursor < pend) {
+	    while (match = ${funcname} (\&cursor)) {
+	      split_and_add(results, match);
+	    }
+	  }
+EOT
+
+  }
+
+  print $re <<EOT;
+	  RETVAL = newRV((SV *) results);
+      OUTPUT:
+	  RETVAL
+
+EOT
+
+  close($re);
+
+  open(FILE, ">$PMFILE") || die "write $PMFILE: $!";
+  print FILE <<"EOT";
+
+package $modname;
+
+use strict;
+use vars qw(\$VERSION \@ISA \@EXPORT_OK);
+
+use DynaLoader ();
+
+BEGIN {
+\$VERSION = '1.0';
+\@ISA = qw(DynaLoader);
+\@EXPORT_OK = qw();
+
+our \$HAS_RULES = {
+  $has_rules
+};
+
+bootstrap $modname \$VERSION;
+}
+
+1;
+__END__
+
+=head1 NAME
+
+$modname - Efficient string matching for regexps found in $FILE
+
+=head1 SYNOPSIS
+
+  use $modname;
+  
+  ...
+  my \$match = ${modname}::scan(\$string);
+
+=head1 DESCRIPTION
+
+This module was created by SpamAssassin with the aid of re2xs, which uses re2c
+to create an XS library capable of scanning through a bunch of regular
+expressions as defined in F<$FILE>.
+
+See C<sa-compile> for more details.
+
+=cut
+EOT
+
+}
+
+sub fixup_re {
+  my $re = shift;
+  
+  # print "FIXUP: $re\n";
+  
+  my $output = "";
+  my $TOK = qr([\{\^\$\"\(\)\[\|\\\.\+\*\?]);
+  
+  my $STATE;
+  while ($re =~ /\G(.*?)($TOK)/gc) {
+    my $pre = $1;
+    my $tok = $2;
+
+    if (length($pre)) {
+      $output .= "\"$pre\"";
+    }
+
+    if ($tok eq "(") {
+      # Grouping
+      my $origpos = pos $re;
+      if ($re =~ /\G(.*)\)/gc) {
+        # trim that down, if necessary, to just the matching
+        # (...) substring.
+        my $subre = find_balanced_group($1);
+        $output .= "( " . fixup_re($subre) . " )";
+        # reset m//g offset to the end of that group, so that
+        # the next token is taken from the right place
+        pos $re = $origpos + length($subre) + 1;
+      }
+      else {
+        die "re: $re doesn't have group closing bracket";
+      }
+    }
+    elsif ($tok eq "|") {
+      $output .= " | ";
+    }
+    elsif ($tok eq ")") {
+      $output .= " ) ";
+    }
+    elsif ($tok eq "[") {
+      # chars
+      if ($re =~ /\G(.*?)\]/gc) {
+        $output .= "[$1]";
+      }
+      else {
+        die "re: $re doesn't have character class closing bracket";
+      }
+    }
+    elsif ($tok eq '{') {
+      if ($re =~ /\G(.*?)\}/gc) {
+        $output .= "{$1}";
+      }
+      else {
+        die "re: $re doesn't have quantifier closing bracket";
+      }
+    }
+    elsif ($tok eq '.') {
+      $output .= '.';
+    }
+    elsif ($tok eq '"') {
+      $output .= '"\\';     # reversed later to form \"
+    }
+    elsif ($tok eq '*') {
+      # /guaranteed*/ => "guarantee" "d"* , not "guaranteed"*
+      $output =~ s/(.)\"\s*$/\" \"$1\"/;
+      $output .= '* ';
+    }
+    elsif ($tok eq '?') {
+      # /guaranteed?/ => "guarantee" "d"? , not "guaranteed"?
+      $output =~ s/(.)\"\s*$/\" \"$1\"/;
+      $output .= '? ';
+    }
+    elsif ($tok eq '+') {
+      # /guaranteed+/ => "guarantee" "d"+ , not "guaranteed"+
+      $output =~ s/(.)\"\s*$/\" \"$1\"/;
+      $output .= '+ ';
+    }
+    elsif ($tok eq "^") {
+      die "Unsupported anchor: \\$tok";
+    }
+    elsif ($tok eq "\$") {
+      die "Unsupported anchor: \\$tok";
+    }
+    elsif ($tok eq '\\') {
+      $re =~ /\G(.)/gc or die "\\ at end of string!";
+      my $esc = $1;
+      if ($esc !~ /^
+                [\.\@\$\(\)\/\-\+\*\^\?\!_]
+                $/x)
+      {
+        die "Unsupported escape: \\$esc";
+      }
+      $output .= "\"$esc\"";
+    }
+    else {
+      print "PRE: $pre\nTOK: $tok\n";
+    }
+  }
+  
+  if (!defined(pos($re))) {
+    # no matches
+    $output .= "\"$re\"";
+  }
+  elsif (pos($re) <= length($re)) {
+    $output .= fixup_re(substr($re, pos($re)));
+  }
+  
+  $output =~ s/""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
+  # print "OUTPUT: $output\n";
+  return $output;
+}
+
+sub handle_fixup_error {
+  my ($strat, $regexp, $reason) = @_;
+  if ($strat) {
+    warn "skipped: $regexp: $strat";
+  }
+}
+
+sub find_balanced_group {
+  my ($re) = @_;
+
+  # $re could be:
+  #
+  #   foo
+  #   foo|bar|baz
+  #   foo(aa|bb|cc)|bar|baz
+  #   foo(aa|(bb|cc|ddd))|bar(e|f)
+  #
+  # or, due to inefficient regexp parsing above:
+  #
+  #   foo) totally_unrelated_text (next_grouping => "foo"
+  #
+  # all we have to do here is ensure that we correctly pick out
+  # the balanced group.
+
+  my $output = "";
+  my $TOK = qr([\(\\\)]);
+  my $level = 0;
+  
+  my $STATE;
+  while ($re =~ /\G(.*?)($TOK)/gc) {
+    my $pre = $1;
+    my $tok = $2;
+
+    if (length($pre)) {
+      $output .= $pre;
+    }
+
+    if ($tok eq "(") {
+      $output .= $tok;
+      $level++;
+    }
+    elsif ($tok eq ")") {
+      $level--;
+      if ($level < 0) {
+        # we found the matching close-bracket; stop searching here
+        # warn "JMD partial $re => $output";
+        return $output;
+      }
+      $output .= $tok;
+    }
+    elsif ($tok eq '\\') {
+      $re =~ /\G(.)/gc or die "\\ at end of string!";
+      my $esc = $1;
+      $output .= $tok.$esc;
+    }
+    else {
+      $output .= $tok;
+    }
+  }
+
+  if (!defined(pos($re))) {
+    $output .= $re;
+  }
+  elsif (pos($re) <= length($re)) {
+    $output .= substr($re, pos($re));
+  }
+
+  # we hit end-of-string; in other words, it was a balanced re anyway
+  # warn "JMD fully balanced $re";
+  return $re;
+}
+
+##############################################################################
+
+=cut
+
+=head1 NAME
+
+sa-compile - compile SpamAssassin ruleset into native code
+
+=head1 SYNOPSIS
+
+B<sa-compile> [options]
+
+Options:
+
+  --list                        Output base string list to STDOUT
+
+  -C path, --configpath=path, --config-file=path   Path to standard configuration dir
+  -p prefs, --prefspath=file, --prefs-file=file    Set user preferences file
+  --siteconfigpath=path         Path for site configs (def: /etc/mail/spamassassin)
+  --cf='config line'            Additional line of configuration
+
+  -D, --debug [area=n,...]	Print debugging messages
+  -V, --version			Print version
+  -h, --help			Print usage message
+
+=head1 DESCRIPTION
+
+sa-compile uses C<re2c> to compile the SpamAssassin ruleset into C code in a
+Perl XS module, and from there into native object code.  This will then be used
+by the C<Mail::SpamAssassin::Plugin::Rule2XSBody> plugin to speed up
+SpamAssassin's operation, where possible.
+
+This requires C<re2c> (see C<http://re2c.org/>), and the C compiler used to
+build Perl XS modules, be installed.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<--list>
+
+Output the extracted base strings to STDOUT, instead of generating
+the C extension code.
+
+=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
+
+Use the specified path for locating the distributed configuration files.
+Ignore the default directories (usually C</usr/share/spamassassin> or similar).
+
+=item B<--siteconfigpath>=I<path>
+
+Use the specified path for locating site-specific configuration files.  Ignore
+the default directories (usually C</etc/mail/spamassassin> or similar).
+
+=item B<--cf='config line'>
+
+Add additional lines of configuration directly from the command-line, parsed
+after the configuration files are read.   Multiple B<--cf> arguments can be
+used, and each will be considered a separate line of configuration.
+
+=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
+
+Read user score preferences from I<prefs> (usually
+C<$HOME/.spamassassin/user_prefs>) .
+
+=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
+
+Produce debugging output.  If no areas are listed, all debugging information is
+printed.  Diagnostic output can also be enabled for each area individually;
+I<area> is the area of the code to instrument.
+
+For more information about which areas (also known as channels) are available,
+please see the documentation at:
+
+	C<http://wiki.apache.org/spamassassin/DebugChannels>
+
+=item B<-h>, B<--help>
+
+Print help message and exit.
+
+=item B<-V>, B<--version>
+
+Print sa-compile version and exit.
+
+=back
+
+=head1 SEE ALSO
+
+Mail::SpamAssassin(3)
+spamassassin(1)
+spamd(1)
+
+=head1 PREREQUESITES
+
+C<Mail::SpamAssassin>
+C<re2c>
+
+=head1 BUGS
+
+See <http://issues.apache.org/SpamAssassin/>
+
+=head1 AUTHORS
+
+The Apache SpamAssassin(tm) Project <http://spamassassin.apache.org/>
+
+=head1 COPYRIGHT
+
+SpamAssassin is distributed under the Apache License, Version 2.0, as
+described in the file C<LICENSE> included with the distribution.
+
+=cut
+

Propchange: spamassassin/branches/jm_re2c_hacks/sa-compile.raw
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t?view=auto&rev=471922
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t (added)
+++ spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t Mon Nov  6 15:01:01 2006
@@ -0,0 +1,134 @@
+#!/usr/bin/perl
+
+# Test regular expression base-string extraction in
+# Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+
+use lib '.'; use lib 't';
+use SATest; sa_t_init("re_base_extraction");
+use Test;
+use strict;
+use warnings;
+
+BEGIN { 
+  if (-e 't/test_dir') { chdir 't'; } 
+  if (-e 'test_dir') { unshift(@INC, '../blib/lib'); }
+
+  plan tests => 20;
+
+};
+
+try_extraction ('
+    body FOO /foo bar/
+    body EXCUSE_REMOVE /to be removed from.{0,20}(?:mailings|offers)/i
+    body KAM_STOCKTIP15 /(?:Nano Superlattice Technology|NSLT)/is
+    body TEST1 /foo(?:ish)? bar/
+    body TEST2 /foody* bar/
+    body TEST3 /foody? bar/
+    body TEST4 /A(?i:ct) N(?i:ow)/
+    body TEST5 /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
+
+', {
+    base_extract => 1,
+    bases_must_be_casei => 1,
+    bases_can_use_alternations => 0,
+    bases_can_use_quantifiers => 0,
+    bases_can_use_char_classes => 0,
+    bases_split_out_alternations => 1
+}, [
+
+    'foo bar:TEST1 FOO',
+    'to be removed from:EXCUSE_REMOVE',
+    'nslt:KAM_STOCKTIP15',
+    'nano superlattice technology:KAM_STOCKTIP15',
+    'fooish bar:TEST1',
+    'act now:TEST4',
+    'food:TEST2',
+    'food bar:TEST3 TEST2',
+    'foody bar:TEST3 TEST2',
+    'refinanc:TEST5',
+    'time to refinance:TEST5',
+
+
+]);
+
+try_extraction ('
+    body FOO /foo bar/
+    body EXCUSE_REMOVE /to be removed from.{0,20}(?:mailings|offers)/i
+    body KAM_STOCKTIP15 /(?:Nano Superlattice Technology|NSLT)/is
+    body TEST1 /foo(?:ish)? bar/
+
+', {
+    base_extract => 1,
+    bases_must_be_casei => 1,
+    bases_can_use_alternations => 0,
+    bases_can_use_quantifiers => 0,
+    bases_can_use_char_classes => 0,
+    bases_split_out_alternations => 0
+}, [
+
+    'foo bar:FOO',
+    'to be removed from:EXCUSE_REMOVE',
+],[
+
+    'foo bar:FOO TEST1',
+    'nano superlattice technology:KAM_STOCKTIP15',
+    'fooish bar:TEST1'
+
+]);
+###########################################################################
+
+use Mail::SpamAssassin;
+
+sub try_extraction {
+  my ($rules, $params, $output, $notoutput) = @_;
+
+  my $sa = Mail::SpamAssassin->new({
+    rules_filename => "log/test_rules_copy",
+    site_rules_filename => "log/test_default.cf",
+    userprefs_filename  => "log/userprefs.cf",
+    local_tests_only    => 1,
+    debug             => 0,
+    dont_copy_prefs   => 1,
+  });
+  ok($sa);
+
+  # remove all rules and plugins; we want just our stuff
+  unlink(<log/test_rules_copy/*.pm>);
+  unlink(<log/test_rules_copy/*.cf>);
+
+  open (OUT, ">log/test_rules_copy/00_test.cf") or die "failed to write rule";
+  print OUT "
+    loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+    ".$rules;
+  close OUT;
+
+  my ($k, $v);
+  while (($k, $v) = each %{$params}) { $sa->{$k}=$v; }
+
+  $sa->init();
+  ok ($sa->lint_rules() == 0) or warn "lint failed: $rules";
+
+  my $conf = $sa->{conf};
+  my $ruletype = "body_0";
+  foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
+    print "INPUT: $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
+  }
+  my %found = ();
+  foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
+    my $str = "$key:$conf->{base_string}->{$ruletype}->{$key}";
+    print "BASES: '$str'\n";
+    $found{$str} = 1;
+  }
+
+  # $output ||= [];
+  foreach my $line (@{$output}) {
+    ok($found{$line}) or warn "failed to find '$line'";
+  }
+
+  $notoutput ||= [];
+  foreach my $line (@{$notoutput}) {
+    ok(!$found{$line}) or warn "found '$line' but didn't want to";
+  }
+}
+
+

Propchange: spamassassin/branches/jm_re2c_hacks/t/re_base_extraction.t
------------------------------------------------------------------------------
    svn:executable = *