You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/11/24 18:29:43 UTC

svn commit: r478926 [1/2] - in /spamassassin/branches/jm_re2c_hacks: ./ build/ build/automc/ build/buildbot/ build/mkupdates/ lib/Mail/ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Plugin/ masses/ masses/rule-qa/ masses/rule-qa/automc/ rules/ t/

Author: jm
Date: Fri Nov 24 09:29:40 2006
New Revision: 478926

URL: http://svn.apache.org/viewvc?view=rev&rev=478926
Log:
merged to svn trunk's head with: 'svn merge -r475397:478919 https://svn.apache.org/repos/asf/spamassassin/trunk'

Added:
    spamassassin/branches/jm_re2c_hacks/build/parse-rules-for-masses
      - copied unchanged from r478919, spamassassin/trunk/build/parse-rules-for-masses
    spamassassin/branches/jm_re2c_hacks/masses/rule-qa/automc/ruleqa.css
      - copied unchanged from r478919, spamassassin/trunk/masses/rule-qa/automc/ruleqa.css
Removed:
    spamassassin/branches/jm_re2c_hacks/masses/parse-rules-for-masses
Modified:
    spamassassin/branches/jm_re2c_hacks/MANIFEST
    spamassassin/branches/jm_re2c_hacks/MANIFEST.SKIP
    spamassassin/branches/jm_re2c_hacks/README
    spamassassin/branches/jm_re2c_hacks/build/automc/etc-apache-local-conf-httpd.conf
    spamassassin/branches/jm_re2c_hacks/build/buildbot/master.cfg
    spamassassin/branches/jm_re2c_hacks/build/mkupdates/run_part2
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/ArchiveIterator.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Check.pm
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/P595Body.pm
    spamassassin/branches/jm_re2c_hacks/masses/Makefile
    spamassassin/branches/jm_re2c_hacks/masses/find-extremes
    spamassassin/branches/jm_re2c_hacks/masses/generate-translation
    spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies
    spamassassin/branches/jm_re2c_hacks/masses/logs-to-c
    spamassassin/branches/jm_re2c_hacks/masses/mass-check
    spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs
    spamassassin/branches/jm_re2c_hacks/masses/rewrite-cf-with-new-scores
    spamassassin/branches/jm_re2c_hacks/masses/rule-qa/automc/ruleqa.cgi
    spamassassin/branches/jm_re2c_hacks/masses/rule-qa/corpus-nightly
    spamassassin/branches/jm_re2c_hacks/masses/score-ranges-from-freqs
    spamassassin/branches/jm_re2c_hacks/rules/20_head_tests.cf
    spamassassin/branches/jm_re2c_hacks/rules/20_ratware.cf
    spamassassin/branches/jm_re2c_hacks/rules/50_scores.cf
    spamassassin/branches/jm_re2c_hacks/rules/active.list
    spamassassin/branches/jm_re2c_hacks/rules/v320.pre
    spamassassin/branches/jm_re2c_hacks/sa-compile.raw
    spamassassin/branches/jm_re2c_hacks/sa-learn.raw
    spamassassin/branches/jm_re2c_hacks/sa-update.raw
    spamassassin/branches/jm_re2c_hacks/t/meta.t

Modified: spamassassin/branches/jm_re2c_hacks/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/MANIFEST?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/MANIFEST (original)
+++ spamassassin/branches/jm_re2c_hacks/MANIFEST Fri Nov 24 09:29:40 2006
@@ -16,12 +16,10 @@
 build/convert_pods_to_doc
 build/get_version
 build/md5sum.pl
+build/mkrules
 build/preprocessor
 build/sha1sum.pl
-contrib/check_spamd
-contrib/mbox-to-check
-contrib/run-corpora
-contrib/run-masses
+build/parse-rules-for-masses
 ldap/README
 ldap/README.testing
 ldap/sa_test.ldif
@@ -112,76 +110,6 @@
 lib/Mail/SpamAssassin/Util/RegistrarBoundaries.pm
 lib/Mail/SpamAssassin/Util/TieOneStringHash.pm
 lib/spamassassin-run.pod
-masses/CORPUS_POLICY
-masses/CORPUS_SUBMIT
-masses/CORPUS_SUBMIT_NIGHTLY
-masses/Makefile
-masses/README
-masses/README.perceptron
-masses/compare-models
-masses/config
-masses/config.set0
-masses/config.set1
-masses/config.set2
-masses/config.set3
-masses/corpora/README
-masses/corpora/fuzzy-hash-maildir
-masses/corpora/mass-find-nonspam
-masses/corpora/remove-tests-from-logs
-masses/corpora/uniq-mailbox
-masses/corpora/uniq-maildirs
-masses/corpora/mk-corpus-link-farm
-masses/cpucount
-masses/evolve_metarule/README
-masses/evolve_metarule/evolve_metarule.c
-masses/evolve_metarule/preproc.pl
-masses/extract-message-from-mbox
-masses/extract-results
-masses/find-extremes
-masses/fp-fn-statistics
-masses/fp-fn-to-tcr
-masses/freqdiff
-masses/generate-corpus
-masses/generate-translation
-masses/graphs/gen-score-freqs-gnuplot-table
-masses/graphs/gnuplot-score-graph
-masses/hit-frequencies
-masses/lint-rules-from-freqs
-masses/logdiff
-masses/logs-to-c
-masses/mass-check
-masses/mass-check.cf
-masses/mboxget
-masses/mk-baseline-results
-masses/mk-roc-graphs
-masses/model-statistics
-masses/overlap
-masses/parse-rules-for-masses
-masses/perceptron.c
-masses/plugins/01_rule_timing.cf
-masses/plugins/HitFreqsRuleTiming.pm
-masses/post-ga-analysis.pl
-masses/remove-ids-from-mclog
-masses/rewrite-cf-with-new-scores
-masses/rule-dev/maildir-scan-headers
-masses/rule-qa/README.nightly
-masses/rule-qa/corpus-hourly
-masses/rule-qa/corpus-nightly
-masses/rule-qa/corpus-tagtime
-masses/rule-qa/corpus.example
-masses/rule-qa/get-rulemetadata-for-revision
-masses/rule-qa/markup-rules-file-with-freqs
-masses/rule-qa/post-bugs-for-retired-tests
-masses/rule-qa/rule-hits-over-time
-masses/runGA
-masses/score-ranges-from-freqs
-masses/tenpass/10pass-compute-tcr
-masses/tenpass/10pass-run
-masses/tenpass/README
-masses/tenpass/compute-current-tcr
-masses/tenpass/split-log-into-buckets
-masses/tenpass/split-log-into-buckets-random
-masses/validate-model
 procmailrc.example
 sa-learn.raw
 sa-update.raw
@@ -455,17 +383,6 @@
 t/whitelist_subject.t
 t/whitelist_to.t
 t/zz_cleanup.t
-tools/README.speedtest
-tools/check_whitelist
-tools/convert_awl_dbm_to_sql
-tools/desc_length.pl
-tools/mboxsplit
-tools/sa-stats.pl
-tools/speedtest
-tools/split_corpora
-tools/sysreport
-tools/test_extract
-build/mkrules
 rules/10_default_prefs.cf
 rules/20_dnsbl_tests.cf
 rules/20_head_tests.cf

Modified: spamassassin/branches/jm_re2c_hacks/MANIFEST.SKIP
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/MANIFEST.SKIP?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/MANIFEST.SKIP (original)
+++ spamassassin/branches/jm_re2c_hacks/MANIFEST.SKIP Fri Nov 24 09:29:40 2006
@@ -34,41 +34,10 @@
 ^debian/
 ^doc/
 ^lib/Mail/SpamAssassin/Plugin/NetCache\.pm$
+^lib/Mail/SpamAssassin/Util/MemoryDump\.pm$
 ^lm/
 ^made-doc-stamp$
 ^Mail-SpamAssassin-.*$
-^masses/analysis$
-^masses/badrules$
-^masses/bayes-testing/
-^masses/commands.sh$
-^masses/copy-logs-to-deimos$
-^masses/download-trapped-spam$
-^masses/dprof.nonspam$
-^masses/dprof.spam$
-^masses/evolve$
-^masses/freqs$
-^masses/galib245$
-^masses/goodresults$
-^masses/local-scripts-.*$
-^masses/logs$
-^masses/nonspam.dogma$
-^masses/nonspam.local$
-^masses/nonspam\..*$
-^masses/old-random-search$
-^masses/overnight.*$
-^masses/perceptron$
-^masses/pgapack$
-^masses/results?\..*$
-^masses/RUNME$
-^masses/spam.dogma$
-^masses/spam.local$
-^masses/spamtrap$
-^masses/spam\..*$
-^masses/start_evolving.sh$
-^masses/stop_evolving.sh$
-^masses/tmon.nonspam$
-^masses/tmon.spam$
-^masses/uniq-scores$
 ^pm_to_blib$
 ^pod2html?-?.*$
 ^presentation$
@@ -101,15 +70,13 @@
 ^todo$
 ^wordfreqs/
 ~$
-^masses/tmp/
-^masses/spamassassin/
+^masses/
+^contrib/
+^tools/
 ^pod2ref
-^masses/rule-qa/automc/
 ^sa-update$
 ^sa-compile$
 ^build/describe-to-po-file$
-^masses/enable-all-evolved-rules$
-^masses/evolve_metarule/Makefile$
 ^rules/70_sandbox.cf$
 ^build/automc/
 ^rulesrc/.*$

Modified: spamassassin/branches/jm_re2c_hacks/README
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/README?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/README (original)
+++ spamassassin/branches/jm_re2c_hacks/README Fri Nov 24 09:29:40 2006
@@ -216,6 +216,22 @@
         '/usr/share/spamassassin/user_prefs.template'
 
 
+In addition, the "Distributed Configuration Files" location is overridden
+by a "Local State Directory", used to store an updated copy of the
+ruleset:
+
+  __prefix__    __local_state_dir__
+  -------------------------------------------------------------------------
+  /usr          /var/lib/spamassassin/__version__
+  /usr/local    /var/lib/spamassassin/__version__
+  /opt/$DIR     /var/opt/spamassassin/__version__
+  $DIR          $DIR/var/spamassassin/__version__
+
+This is normally written to by the "sa-update" script.  "__version__" is
+replaced by a representation of the version number, so that multiple
+versions of SpamAssassin will not interfere with each other's rulesets.
+
+
 After installation, try "perldoc Mail::SpamAssassin::Conf" to see what
 can be set. Common first-time tweaks include:
 

Modified: spamassassin/branches/jm_re2c_hacks/build/automc/etc-apache-local-conf-httpd.conf
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/build/automc/etc-apache-local-conf-httpd.conf?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/build/automc/etc-apache-local-conf-httpd.conf (original)
+++ spamassassin/branches/jm_re2c_hacks/build/automc/etc-apache-local-conf-httpd.conf Fri Nov 24 09:29:40 2006
@@ -108,6 +108,8 @@
   ServerAdmin webmaster@spamassassin.org
   userdir disabled
 
+  Alias /ruleqa.css /home/automc/svn/spamassassin/masses/rule-qa/automc/ruleqa.css
+
   # debugging:
   # RewriteLogLevel 9
   # RewriteLog /var/apache2/logs/rewrite_log
@@ -120,6 +122,8 @@
 
   # index: straight through
   RewriteRule ^/$ /ruleqa.cgi [PT,L]
+
+  RewriteRule ^/ruleqa.css$ /ruleqa.css [PT,L]
 
   # satisfy local files
   RewriteCond /var/www/ruleqa.spamassassin.org/%{REQUEST_FILENAME} !-d

Modified: spamassassin/branches/jm_re2c_hacks/build/buildbot/master.cfg
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/build/buildbot/master.cfg?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/build/buildbot/master.cfg (original)
+++ spamassassin/branches/jm_re2c_hacks/build/buildbot/master.cfg Fri Nov 24 09:29:40 2006
@@ -277,9 +277,9 @@
                         mode="problem",
                         sendToInterestedUsers=True))
 
-from buildbot.status import words
-c['status'].append(words.IRC(host="irc.us.freenode.net", nick="buildbot-bot",
-                             channels=["#spamassassin"]))
+# from buildbot.status import words
+# c['status'].append(words.IRC(host="irc.us.freenode.net", nick="buildbot-bot",
+                             # channels=["#spamassassin"]))
 
 # from buildbot.status import words
 # c['status'].append(words.IRC(host="irc.example.com", nick="bb",

Modified: spamassassin/branches/jm_re2c_hacks/build/mkupdates/run_part2
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/build/mkupdates/run_part2?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/build/mkupdates/run_part2 (original)
+++ spamassassin/branches/jm_re2c_hacks/build/mkupdates/run_part2 Fri Nov 24 09:29:40 2006
@@ -81,22 +81,16 @@
 
 make install                             || exit $?
 
-presdir=$tmpdir/etc/mail/spamassassin
 rulesdir=$tmpdir/share/spamassassin
 
 (
   cd $rulesdir 
 
-  # include ".pre" files for the default distro plugins, like
-  # Mail::SpamAssassin::Plugin::BodyEval, Mail::SpamAssassin::Plugin::Bayes
-  # etc. (bug 5171)
-  cp -p $presdir/*.pre .
-
   # Use this to include plugin .pm files:
-  # tar cvf - *.cf *.pm *.pre            || exit $?
+  # tar cvf - *.cf *.pm                  || exit $?
 
   # or this, to ban code from the updates:
-  tar cvf - *.cf *.pre                   || exit $?
+  tar cvf - *.cf                         || exit $?
 
 ) | gzip -9 > $tmpdir/update.tgz         || exit $?
 

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin.pm Fri Nov 24 09:29:40 2006
@@ -236,6 +236,12 @@
 If set to 1, no tests that require internet access will be performed. (default:
 0)
 
+=item ignore_site_cf_files
+
+If set to 1, any rule files found in the C<site_rules_filename> directory will
+be ignored.  *.pre files (used for loading plugins) found in the
+C<site_rules_filename> directory will still be used. (default: 0)
+
 =item dont_copy_prefs
 
 If set to 1, the user preferences file will not be created if it doesn't
@@ -1412,7 +1418,7 @@
       $self->{languages_filename} = $self->find_rule_support_file("languages");
     }
 
-    if ($siterules) {
+    if ($siterules && !$self->{ignore_site_cf_files}) {
       $self->{config_text} .= $self->read_cf($siterules, 'site rules dir');
     }
 

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/ArchiveIterator.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/ArchiveIterator.pm?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/ArchiveIterator.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/ArchiveIterator.pm Fri Nov 24 09:29:40 2006
@@ -106,6 +106,13 @@
 Only use the first N ham and N spam (or if the value is -N, only use the first
 N total messages regardless of class).
 
+This setting can be specified separately for ham and spam target classes.
+If multiple targets for one class are specified with different
+options, the last target's options will be used.
+
+If the value is negative, and multiple targets are specified with different
+options, the last spam target's setting will be used.
+
 =item opt_tail
 
 Only use the last N ham and N spam (or if the value is -N, only use the last
@@ -115,17 +122,36 @@
 specifies a subset of the C<opt_tail> selection to use; in other words, the
 C<opt_tail> splice is applied first.
 
+This setting can be specified separately for ham and spam target classes.
+If multiple targets for one class are specified with different
+options, the last target's options will be used.
+
+If the value is negative, and multiple targets are specified with different
+options, the last spam target's setting will be used.
+
+=item opt_scanprob
+
+Randomly select messages to scan, with a probability of N, where N ranges
+from 0.0 (no messages scanned) to 1.0 (all messages scanned).  Default
+is 1.0.
+
+This setting can be specified separately for each target.
+
 =item opt_before
 
 Only use messages which are received before the given time_t value.
 Negative values are an offset from the current time, e.g. -86400 =
 last 24 hours; or as parsed by Time::ParseDate (e.g. '-6 months')
 
+This setting can be specified separately for each target.
+
 =item opt_after
 
 Same as opt_before, except the messages are only used if after the given
 time_t value.
 
+This setting can be specified separately for each target.
+
 =item opt_want_date
 
 Set to 1 (default) if you want the received date to be filled in
@@ -136,7 +162,8 @@
 =item opt_cache
 
 Set to 0 (default) if you don't want to use cached information to help speed
-up ArchiveIterator.  Set to 1 to enable.
+up ArchiveIterator.  Set to 1 to enable.  This setting requires C<opt_cachedir>
+also be set.
 
 =item opt_cachedir
 
@@ -183,11 +210,6 @@
   if (!defined $self) { $self = { }; }
   bless ($self, $class);
 
-  $self->{opt_head} = 0 unless (defined $self->{opt_head});
-  $self->{opt_tail} = 0 unless (defined $self->{opt_tail});
-  $self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
-  $self->{opt_cache} = 0 unless (defined $self->{opt_cache});
-
   # If any of these options are set, we need to figure out the message's
   # receive date at scan time.  opt_n == 0, opt_after, opt_before
   $self->{determine_receive_date} = !$self->{opt_n} ||
@@ -226,8 +248,13 @@
 C<.bz2> will be properly uncompressed via call to C<gzip -dc> and C<bzip2 -dc>
 respectively.
 
-The target_paths array is expected to be one element per path in the following
-format: class:format:raw_location
+The target_paths array is expected to be either one element per path in the
+following format: C<class:format:raw_location>, or a hash reference containing
+key-value option pairs and a 'target' key with a value in that format.
+
+The key-value option pairs that can be used are: opt_head, opt_tail,
+opt_scanprob, opt_after, opt_before.  See the constructor method's
+documentation for more information on their effects.
 
 run() returns 0 if there was an error (can't open a file, etc,) and 1 if there
 were no errors.
@@ -419,15 +446,30 @@
 
 ############################################################################
 
+# TODO: this needs POD since mass-check uses it?
 sub message_array {
   my ($self, $targets) = @_;
 
+  my %class_opts = ();
+
   foreach my $target (@${targets}) {
     if (!defined $target) {
       warn "archive-iterator: invalid (undef) value in target list";
       next;
     }
 
+    my %opts = ();
+    if (ref $target eq 'HASH') {
+      # e.g. { target => $target, opt_foo => 1, opt_bar => 0.4 ... }
+      foreach my $k (keys %{$target}) {
+        next unless ($k =~ /^opt_/);
+        my $v = $target->{$k};
+        next unless defined $v;
+        $opts{$k} = $v;
+      }
+      $target = $target->{target};
+    }
+
     my ($class, $format, $rawloc) = split(/:/, $target, 3);
 
     # "class"
@@ -444,6 +486,15 @@
     # use ham by default, things like "spamassassin" can't specify the type
     $class = substr($class, 0, 1) || 'h';
 
+    # keep a copy of the most recent message-selection options for
+    # each class
+    $class_opts{$class} = \%opts;
+
+    foreach my $k (keys %opts) {
+      $self->{$k} = $opts{$k};
+    }
+    $self->set_default_message_selection_opts();
+
     my @locations = $self->fix_globs($rawloc);
 
     foreach my $location (@locations) {
@@ -488,20 +539,13 @@
     }
   }
 
+  $self->top_and_tail_messages($self->{h}, $class_opts{h});
+  $self->top_and_tail_messages($self->{s}, $class_opts{s});
+
   my $messages;
   if ($self->{opt_n}) {
     # OPT_N == 1 means don't bother sorting on message receive date
 
-    # head or tail > 0 means crop each list
-    if ($self->{opt_tail} > 0) {
-      splice(@{$self->{s}}, 0, -$self->{opt_tail});
-      splice(@{$self->{h}}, 0, -$self->{opt_tail});
-    }
-    if ($self->{opt_head} > 0) {
-      splice(@{$self->{s}}, min ($self->{opt_head}, scalar @{$self->{s}}));
-      splice(@{$self->{h}}, min ($self->{opt_head}, scalar @{$self->{h}}));
-    }
-
     # for ease of memory, we'll play with pointers
     $messages = $self->{s};
     undef $self->{s};
@@ -512,21 +556,11 @@
     # OPT_N == 0 means sort on message receive date
 
     # Sort the spam and ham groups by date
-    my @s = sort { $a cmp $b } @{$self->{s}};
+    my @s = @{$self->{s}};
     undef $self->{s};
-    my @h = sort { $a cmp $b } @{$self->{h}};
+    my @h = @{$self->{h}};
     undef $self->{h};
 
-    # head or tail > 0 means crop each list
-    if ($self->{opt_tail} > 0) {
-      splice(@s, 0, -$self->{opt_tail});
-      splice(@h, 0, -$self->{opt_tail});
-    }
-    if ($self->{opt_head} > 0) {
-      splice(@s, min ($self->{opt_head}, scalar @s));
-      splice(@h, min ($self->{opt_head}, scalar @h));
-    }
-
     # interleave ordered spam and ham
     if (@s && @h) {
       my $ratio = @s / @h;
@@ -569,6 +603,52 @@
   return 1;
 }
 
+sub set_default_message_selection_opts {
+  my ($self) = @_;
+  $self->{opt_head} = 0 unless (defined $self->{opt_head});
+  $self->{opt_tail} = 0 unless (defined $self->{opt_tail});
+  $self->{opt_scanprob} = 1.0 unless (defined $self->{opt_scanprob});
+  $self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
+  $self->{opt_cache} = 0 unless (defined $self->{opt_cache});
+}
+
+sub top_and_tail_messages {
+  my ($self, $ary, $opts) = @_;
+
+  foreach my $k (keys %{$opts}) {
+    $self->{$k} = $opts->{$k};
+  }
+  $self->set_default_message_selection_opts();
+
+  if ($self->{opt_n}) {
+    # OPT_N == 1 means don't bother sorting on message receive date
+
+    # head or tail > 0 means crop each list
+    if ($self->{opt_tail} > 0) {
+      splice(@{$ary}, 0, -$self->{opt_tail});
+    }
+    if ($self->{opt_head} > 0) {
+      splice(@{$ary}, min ($self->{opt_head}, scalar @{$ary}));
+    }
+  }
+  else {
+    # OPT_N == 0 means sort on message receive date
+
+    # Sort the spam and ham groups by date
+    my @s = sort { $a cmp $b } @{$ary};
+
+    # head or tail > 0 means crop each list
+    if ($self->{opt_tail} > 0) {
+      splice(@s, 0, -$self->{opt_tail});
+    }
+    if ($self->{opt_head} > 0) {
+      splice(@s, min ($self->{opt_head}, scalar @s));
+    }
+
+    @{$ary} = @s;
+  }
+}
+
 ############################################################################
 
 sub message_is_useful_by_date {
@@ -607,6 +687,16 @@
   }
 }
 
+sub scanprob_says_scan {
+  my ($self) = @_;
+  if (defined $self->{opt_scanprob} && $self->{opt_scanprob} < 1.0) {
+    if ( int( rand( 1 / $self->{opt_scanprob} ) ) != 0 ) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
 ############################################################################
 
 # 0 850852128			atime
@@ -694,6 +784,7 @@
   }
 
   return if !$self->message_is_useful_by_date($date);
+  return if !$self->scanprob_says_scan();
   push(@{$self->{$class}}, index_pack($date, $class, "f", $mail));
 }
 
@@ -791,6 +882,7 @@
       if ($self->{determine_receive_date}) {
         next if !$self->message_is_useful_by_date($v);
       }
+      next if !$self->scanprob_says_scan();
 
       push(@{$self->{$class}}, index_pack($v, $class, "m", "$file.$k"));
     }
@@ -898,6 +990,7 @@
       if ($self->{determine_receive_date}) {
         next if !$self->message_is_useful_by_date($v);
       }
+      next if !$self->scanprob_says_scan();
 
       push(@{$self->{$class}}, index_pack($v, $class, "b", "$file.$k"));
     }

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Nov 24 09:29:40 2006
@@ -34,6 +34,8 @@
 use warnings;
 use bytes;
 
+use re qw(regmust);     # added in blead, 2006-11-16
+
 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);
 
@@ -107,7 +109,7 @@
   # may be a good long string of text at the end of the rule.
 
   # require this many chars in a base string, for it to be viable
-  my $min_chars = 4;
+  my $min_chars = 3;
 
   foreach my $name (keys %{$rules}) {
     my $rule = $rules->{$name};
@@ -116,16 +118,31 @@
     # TODO: need cleaner way to do this
     next if ($conf->{rules_to_replace}->{$name});
 
-    my @bases1 = ();
-    my @bases2 = ();
-    eval {  # catch die()s
-      @bases1 = $self->extract_hints($rule, 0);
-    };
-    $@ and dbg("giving up on that direction: $@");
-    eval {
-      @bases2 = $self->extract_hints($rule, 1);
-    };
-    $@ and dbg("giving up on that direction: $@");
+    my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
+    my ($anchored, $floating) = regmust(qr/$qr/);
+    my @bases1 = (quotemeta $anchored);
+    my @bases2 = (quotemeta $floating);
+    # my @bases1 = ();
+    # my @bases2 = ();
+
+    my $len1 = 0;
+    my $len2 = 0;
+    if ($anchored) { $len1 = length($anchored); }
+    if ($floating) { $len2 = length($floating); }
+
+    # fall back to using our own code, since the regexp is too
+    # complex (probably alternations involved).
+    if ((!$anchored || $len1 < $min_chars) && (!$floating || $len2 < $min_chars))
+    {
+      eval {  # catch die()s
+        @bases1 = $self->extract_hints($qr, $mods, 0);
+      };
+      $@ and dbg("giving up on that direction: $@");
+      eval {  # catch die()s
+        @bases2 = $self->extract_hints($qr, $mods, 1);
+      };
+      $@ and dbg("giving up on that direction: $@");
+    }
 
     # if any of the extracted hints in a set are too short, the entire
     # set is invalid; this is because each set of N hints represents just
@@ -154,6 +171,7 @@
       # both are valid; use the end with the longer hints
       if ($minlen2 > $minlen1) {
         @bases1 = @bases2;
+        $minlen1 = $minlen2;
       }
     }
 
@@ -256,13 +274,11 @@
 # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
 #     => should understand alternations; tricky
 
-sub extract_hints {
+sub simplify_and_qr_regexp {
   my $self = shift;
   my $rule = shift;
-  my $is_reversed = shift;
 
   my $main = $self->{main};
-  my $orig = $rule;
   $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
 
   # remove the regexp modifiers, keep for later
@@ -320,6 +336,17 @@
   # remove the "?=" trick
   # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
   $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
+  ($rule, $mods);
+}
+
+sub extract_hints {
+  my $self = shift;
+  my $rule = shift;
+  my $mods = shift;
+  my $is_reversed = shift;
+
+  my $main = $self->{main};
+  my $orig = $rule;
 
   # if there are anchors, give up; we can't get much 
   # faster than these anyway

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Check.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Check.pm?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Check.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Check.pm Fri Nov 24 09:29:40 2006
@@ -40,6 +40,8 @@
   return $self;
 }
 
+###########################################################################
+
 sub check_main {
   my ($self, $args) = @_;
 
@@ -68,14 +70,17 @@
     next unless ($pms->{conf}->{priorities}->{$priority} > 0);
 
     # if shortcircuiting is hit, we skip all other priorities...
-    last if $self->shortcircuited_p();
+    last if $self->{main}->call_plugins("have_shortcircuited", { permsgstatus => $pms });
 
     dbg("check: running tests for priority: $priority");
 
     # only harvest the dnsbl queries once priority HARVEST_DNSBL_PRIORITY
     # has been reached and then only run once
-    if ($priority >= HARVEST_DNSBL_PRIORITY && $needs_dnsbl_harvest_p
-	&& !$self->shortcircuited_p($pms)) {
+    if ($priority >= HARVEST_DNSBL_PRIORITY
+        && $needs_dnsbl_harvest_p
+        && !$self->{main}->call_plugins("have_shortcircuited",
+                                        { permsgstatus => $pms }))
+    {
       # harvest the DNS results
       $pms->harvest_dnsbl_queries();
       $needs_dnsbl_harvest_p = 0;
@@ -91,7 +96,7 @@
     $self->do_head_eval_tests($pms, $priority);
 
     $self->do_body_tests($pms, $priority, $decoded);
-    $self->do_body_uri_tests($pms, $priority, @uris);
+    $self->do_uri_tests($pms, $priority, @uris);
     $self->do_body_eval_tests($pms, $priority, $decoded);
   
     $self->do_rawbody_tests($pms, $priority, $bodytext);
@@ -110,7 +115,9 @@
   # sanity check, it is possible that no rules >= HARVEST_DNSBL_PRIORITY ran so the harvest
   # may not have run yet.  Check, and if so, go ahead and harvest here.
   if ($needs_dnsbl_harvest_p) {
-    if (!$self->shortcircuited_p($pms)) {
+    if (!$self->{main}->call_plugins("have_shortcircuited",
+                                        { permsgstatus => $pms }))
+    {
       # harvest the DNS results
       $pms->harvest_dnsbl_queries();
     }
@@ -143,6 +150,8 @@
   @TEMPORARY_METHODS = ();      # clear for next time
 }
 
+###########################################################################
+
 sub run_rbl_eval_tests {
   my ($self, $pms) = @_;
   my ($rulename, $pat, @args);
@@ -174,42 +183,125 @@
   }
 }
 
-sub do_meta_tests {
-  my ($self, $pms, $priority) = @_;
-  
-  # XXX - why not just make the plugin call?
-  return if $self->shortcircuited_p($pms);
+###########################################################################
 
-  dbg("rules: running meta tests; score so far=" . $pms->{score} );
-  my $conf = $pms->{conf};
+sub run_generic_tests {
+  my ($self, $pms, $priority, %opts) = @_;
+
+  return if $self->{main}->call_plugins("have_shortcircuited",
+                                        { permsgstatus => $pms });
 
-  my $doing_user_rules = 
-    $conf->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_META_TESTS};
+  my $ruletype = $opts{type};
+  dbg("rules: running ".$ruletype." tests; score so far=".$pms->{score});
+  $pms->{test_log_msgs} = ();        # clear test state
+
+  my $conf = $pms->{conf};
+  my $doing_user_rules = $conf->{user_rules_to_compile}->{$opts{consttype}};
 
   # clean up priority value so it can be used in a subroutine name
   my $clean_priority;
   ($clean_priority = $priority) =~ s/-/neg/;
-
   my $package_name = __PACKAGE__;
+  my $methodname = $package_name."::_".$ruletype."_tests_".$clean_priority;
 
-  # speedup code provided by Matt Sergeant
-  if (defined &{"${package_name}::_meta_tests_${clean_priority}"}
-       && !$doing_user_rules) {
+  if (defined &{$methodname} && !$doing_user_rules) {
     no strict "refs";
-    &{"${package_name}::_meta_tests_${clean_priority}"}($pms);
+run_compiled_method:
+    $methodname->($pms, @{$opts{args}});
     use strict "refs";
     return;
   }
 
-  my (%rule_deps, %meta, $rulename);
-  my $evalstr = '';
+  # build up the eval string...
+  $self->{evalstr} = $self->start_rules_plugin_code($ruletype, $priority);
+  $self->{evalstr2} = '';
+
+  # use %nopts for named parameter-passing; it's more friendly to future-proof
+  # subclassing, since new parameters can be added without breaking third-party
+  # subclassed implementations of this plugin.
+  my %nopts = (
+    ruletype => $ruletype,
+    doing_user_rules => $doing_user_rules,
+    priority => $priority,
+    clean_priority => $clean_priority
+  );
+
+  if (defined $opts{pre_loop_body}) {
+    $opts{pre_loop_body}->($self, $pms, $conf, %nopts);
+  }
+  while (my($rulename, $test) = each %{$opts{testhash}->{$priority}}) {
+    $opts{loop_body}->($self, $pms, $conf, $rulename, $test, %nopts);
+  }
+  if (defined $opts{post_loop_body}) {
+    $opts{post_loop_body}->($self, $pms, $conf, %nopts);
+  }
+
+  # clear out a previous version of this fn
+  undef &{$methodname};
+  $self->free_ruleset_source($pms, $ruletype, $priority);
+
+  my $evalstr = $self->{evalstr};
+
+  # generate the loop that goes through each line...
+  $evalstr = <<"EOT";
+  {
+    package $package_name;
+
+    $self->{evalstr2}
+
+    sub $methodname {
+      my \$self = shift;
+      $evalstr;
+    }
+
+    1;
+  }
+EOT
+
+  delete $self->{evalstr};
+  delete $self->{evalstr2}; # free up some RAM before we eval()
+
+  ## dbg ("rules: eval code to compile: $evalstr");
+  eval $evalstr;
+  if ($@) {
+    warn("rules: failed to compile $ruletype tests, skipping:\n\t($@)\n");
+    $pms->{rule_errors}++;
+  }
+  else {
+    goto run_compiled_method;
+  }
+}
+
+sub add_evalstr {
+  my ($self, $str) = @_;
+  $self->{evalstr} .= $str;
+}
+
+sub add_evalstr2 {
+  my ($self, $str) = @_;
+  $self->{evalstr2} .= $str;
+}
+
+sub add_temporary_method {
+  my ($self, $methodname, $methodbody) = @_;
+  $self->add_evalstr2 (' sub '.$methodname.' { '.$methodbody.' } ');
+  push (@TEMPORARY_METHODS, $methodname);
+}
 
-  # Get the list of meta tests
-  my @metas = keys %{$conf->{meta_tests}->{$priority}};
+###########################################################################
+
+sub do_meta_tests {
+  my ($self, $pms, $priority) = @_;
+  my (%rule_deps, %meta, $rulename);
 
-  # Go through each rule and figure out what we need to do
-  foreach $rulename (@metas) {
-    my $rule = $conf->{meta_tests}->{$priority}->{$rulename};
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_META_TESTS,
+    type => 'meta',
+    testhash => $pms->{conf}->{meta_tests},
+    args => [ ],
+    loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $rule, %opts) = @_;
     my $token;
 
     # Lex the rule into tokens using a rather simple RE method ...
@@ -254,150 +346,97 @@
 
         # If the token is another meta rule, add it as a dependency
         push (@{ $rule_deps{$rulename} }, $token)
-          if (exists $conf->{meta_tests}->{$priority}->{$token});
+          if (exists $conf->{meta_tests}->{$opts{priority}}->{$token});
       }
     }
-  }
-
-  # Sort by length of dependencies list.  It's more likely we'll get
-  # the dependencies worked out this way.
-  @metas = sort { @{ $rule_deps{$a} } <=> @{ $rule_deps{$b} } } @metas;
-
-  my $count;
-  my $tflags = $conf->{tflags};
-
-  # Now go ahead and setup the eval string
-  do {
-    $count = $#metas;
-    my %metas = map { $_ => 1 } @metas; # keep a small cache for fast lookups
-
-    # Go through each meta rule we haven't done yet
-    for (my $i = 0 ; $i <= $#metas ; $i++) {
+  },
+    pre_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
+    $self->add_evalstr ('
+      my $r;
+      my $h = $self->{tests_already_hit};
+    ');
+  },
+    post_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
 
-      # If we depend on meta rules that haven't run yet, skip it
-      next if (grep( $metas{$_}, @{ $rule_deps{ $metas[$i] } }));
+    # Sort by length of dependencies list.  It's more likely we'll get
+    # the dependencies worked out this way.
+    my @metas = sort { @{ $rule_deps{$a} } <=> @{ $rule_deps{$b} } }
+                keys %{$conf->{meta_tests}->{$opts{priority}}};
+
+    my $count;
+    my $tflags = $conf->{tflags};
+
+    # Now go ahead and setup the eval string
+    do {
+      $count = $#metas;
+      my %metas = map { $_ => 1 } @metas; # keep a small cache for fast lookups
+
+      # Go through each meta rule we haven't done yet
+      for (my $i = 0 ; $i <= $#metas ; $i++) {
+
+        # If we depend on meta rules that haven't run yet, skip it
+        next if (grep( $metas{$_}, @{ $rule_deps{ $metas[$i] } }));
+
+        # If we depend on network tests, call ensure_rules_are_complete()
+        # to block until they are
+        my $alldeps = join ' ', grep {
+                ($tflags->{$_}||'') =~ /\bnet\b/
+              } split (' ', $conf->{meta_dependencies}->{ $metas[$i] } );
+
+        if ($alldeps ne '') {
+          $self->add_evalstr ('
+            $self->ensure_rules_are_complete(q{'.$metas[$i].'}, qw{'.$alldeps.'});
+          ');
+        }
 
-      # If we depend on network tests, call ensure_rules_are_complete()
-      # to block until they are
-      my $alldeps = join ' ', grep {
-              ($tflags->{$_}||'') =~ /\bnet\b/
-            } split (' ', $conf->{meta_dependencies}->{ $metas[$i] } );
+        # Add this meta rule to the eval line
+        $self->add_evalstr ('
+          $r = '.$meta{$metas[$i]}.';
+          if ($r) { $self->got_hit(q#'.$metas[$i].'#, "", ruletype => "meta", value => $r); }
+        ');
 
-      if ($alldeps ne '') {
-        $evalstr .= '  $pms->ensure_rules_are_complete(q{'.$metas[$i].'}, qw{'.$alldeps.'});';
+        splice @metas, $i--, 1;    # remove this rule from our list
       }
+    } while ($#metas != $count && $#metas > -1); # run until we can't go anymore
 
-      # Add this meta rule to the eval line
-      $evalstr .= '
-        $r = '.$meta{$metas[$i]}.';
-        if ($r) { $pms->got_hit(q#'.$metas[$i].'#, "", ruletype => "meta", value => $r); }
-      ';
-
-      splice @metas, $i--, 1;    # remove this rule from our list
-    }
-  } while ($#metas != $count && $#metas > -1); # run until we can't go anymore
-
-  # If there are any rules left, we can't solve the dependencies so complain
-  my %metas = map { $_ => 1 } @metas; # keep a small cache for fast lookups
-  foreach $rulename (@metas) {
-    $pms->{rule_errors}++; # flag to --lint that there was an error ...
-    my $msg =
-	"rules: excluding meta test $rulename, unsolved meta dependencies: " .
-	    join(", ", grep($metas{$_}, @{ $rule_deps{$rulename} }));
-    if ($self->{main}->{lint_rules}) {
-      warn $msg."\n";
-    }
-    else {
-      info($msg);
+    # If there are any rules left, we can't solve the dependencies so complain
+    my %metas = map { $_ => 1 } @metas; # keep a small cache for fast lookups
+    foreach $rulename (@metas) {
+      $pms->{rule_errors}++; # flag to --lint that there was an error ...
+      my $msg =
+          "rules: excluding meta test $rulename, unsolved meta dependencies: " .
+              join(", ", grep($metas{$_}, @{ $rule_deps{$rulename} }));
+      if ($self->{main}->{lint_rules}) {
+        warn $msg."\n";
+      }
+      else {
+        info($msg);
+      }
     }
   }
-
-  no strict "subs";
-  undef &{"${package_name}::_meta_tests_${clean_priority}"};
-  use strict "subs";
-  $self->free_ruleset_source($pms, 'meta', $priority);
-
-  return unless ($evalstr);
-
-  # setup the environment for meta tests
-  $evalstr = <<"EOT";
-{
-    package $package_name;
-
-    sub _meta_tests_$clean_priority {
-        # note: cannot set \$^W here on perl 5.6.1 at least, it
-        # crashes meta tests.
-
-        my (\$pms) = \@_;
-	my \$r;
-
-        my \$h = \$pms->{tests_already_hit};
-
-        $evalstr;
-    }
-
-    1;
+  );
 }
-EOT
 
-  eval $evalstr;
-
-  if ($@) {
-    warn "rules: failed to run meta tests, skipping some: $@\n";
-    $pms->{rule_errors}++;
-  }
-  else {
-    my $method = "${package_name}::_meta_tests_${clean_priority}";
-    push @TEMPORARY_METHODS, $method;
-    no strict "refs";
-    &{$method}($pms);
-    use strict "refs";
-  }
-}    # do_meta_tests()
+###########################################################################
 
 sub do_head_tests {
   my ($self, $pms, $priority) = @_;
-  local ($_);
-
-  # XXX - why not just do the plugin call?
-  return if $self->shortcircuited_p($pms);
-
-  # note: we do this only once for all head pattern tests.  Only
-  # eval tests need to use stuff in here.
-  $pms->{test_log_msgs} = ();        # clear test state
-
-  dbg("rules: running header regexp tests; score so far=".$pms->{score});
-
-  my $doing_user_rules = 
-    $pms->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_HEAD_TESTS};
-
-  # clean up priority value so it can be used in a subroutine name
-  my $clean_priority;
-  ($clean_priority = $priority) =~ s/-/neg/;
-
-  my $package_name = __PACKAGE__;
-
-  # speedup code provided by Matt Sergeant
-  if (defined &{"${package_name}::_head_tests_${clean_priority}"}
-      && !$doing_user_rules) {
-    no strict "refs";
-    &{"${package_name}::_head_tests_${clean_priority}"}($pms);
-    use strict "refs";
-    return;
-  }
-
-  my $conf = $pms->{conf};
-  my $tflags = $conf->{tflags};
-  my $use_rule_subs = $self->{main}->{use_rule_subs};
-
-  my $evalstr = $self->start_rules_plugin_code("header", $priority);
-  my $evalstr2 = '';
-
   # hash to hold the rules, "header\tdefault value" => rulename
   my %ordered = ();
   my %testcode = ();
 
-  while (my($rulename, $rule) = each %{$conf->{head_tests}->{$priority}}) {
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_HEAD_TESTS,
+    type => 'head',
+    testhash => $pms->{conf}->{head_tests},
+    args => [ ],
+    loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $rule, %opts) = @_;
     my $def = '';
     my ($hdrname, $testtype, $pat) =
         $rule =~ /^\s*(\S+)\s*(\=|\!)\~\s*(\S.*?\S)\s*$/;
@@ -415,58 +454,68 @@
 
     push(@{$ordered{"$hdrname\t$def"}}, $rulename);
 
-    if ($doing_user_rules) {
-      next if (!$self->is_user_rule_sub ($rulename.'_head_test'));
-    }
+    next if ($opts{doing_user_rules} &&
+            !$self->is_user_rule_sub($rulename.'_head_test'));
 
-    if ($use_rule_subs) {
-      $evalstr2 .= '
-        sub '.$rulename.'_head_test {
+    # caller can set this member of the Mail::SpamAssassin object to
+    # override this; useful for profiling rule runtimes, although I think
+    # the HitFreqsRuleTiming.pm plugin is probably better nowadays anyway
+      if ($self->{main}->{use_rule_subs}) {
+      $self->add_temporary_method ($rulename.'_head_test', '{
           my($self,$text) = @_;
           '.$self->hash_line_for_rule($pms, $rulename).'
 	    while ($text '.$testtype.'~ '.$pat.'g) {
             $self->got_hit(q#'.$rulename.'#, "", ruletype => "header");
             '. $self->hit_rule_plugin_code($pms, $rulename, "header", "last") . '
             }
-        }
-      ';
-      push (@TEMPORARY_METHODS, $rulename.'_head_test');
+        }');
     }
     else {
       # store for use below
       $testcode{$rulename} = $testtype.'~ '.$pat;
     }
-  }
+  },
+    pre_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
+    $self->add_evalstr ('
+      my $hval;
+    ');
+  },
+    post_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
+    # setup the function to run the rules
+    while(my($k,$v) = each %ordered) {
+      my($hdrname, $def) = split(/\t/, $k, 2);
+      $self->add_evalstr ('
+        $hval = $self->get(q#'.$hdrname.'#, q#'.$def.'#);
+      ');
+      foreach my $rulename (@{$v}) {
+        if ($self->{main}->{use_rule_subs}) {
+          $self->add_evalstr ('
+            if ($scoresptr->{q#'.$rulename.'#}) {
+              '.$rulename.'_head_test($self, $hval);
+              '.$self->ran_rule_plugin_code($rulename, "header").'
+            }
+          ');
+        }
+        else {
+          my $testcode = $testcode{$rulename};
 
-  # setup the function to run the rules
-  while(my($k,$v) = each %ordered) {
-    my($hdrname, $def) = split(/\t/, $k, 2);
-    $evalstr .= ' $hval = $self->get(q#'.$hdrname.'#, q#'.$def.'#);';
-    foreach my $rulename (@{$v}) {
-      if ($use_rule_subs) {
-        $evalstr .= '
-          if ($scoresptr->{q#'.$rulename.'#}) {
-             '.$rulename.'_head_test($self, $hval);
-             '.$self->ran_rule_plugin_code($rulename, "header").'
+          my $posline = '';
+          my $ifwhile = 'if';
+          my $hitdone = '';
+          my $matchg = '';
+          if (($conf->{tflags}->{$rulename}||'') =~ /\bmultiple\b/)
+          {
+            $posline = 'pos $hval = 0;';
+            $ifwhile = 'while';
+            $hitdone = 'last';
+            $matchg = 'g';
           }
-        ';
-      }
-      else {
-        my $testcode = $testcode{$rulename};
-
-        my $posline = '';
-        my $ifwhile = 'if';
-        my $hitdone = '';
-        my $matchg = '';
-        if (($tflags->{$rulename}||'') =~ /\bmultiple\b/)
-        {
-          $posline = 'pos $hval = 0;';
-          $ifwhile = 'while';
-          $hitdone = 'last';
-          $matchg = 'g';
-        }
 
-        $evalstr .= '
+          $self->add_evalstr ('
           if ($scoresptr->{q#'.$rulename.'#}) {
             '.$posline.'
             '.$self->hash_line_for_rule($pms, $rulename).'
@@ -476,105 +525,34 @@
             }
             '.$self->ran_rule_plugin_code($rulename, "header").'
           }
-        ';
+          ');
+        }
       }
     }
   }
-
-  # clear out a previous version of this fn, if already defined
-  no strict "subs";
-  undef &{"${package_name}::_head_tests_${clean_priority}"};
-  use strict "subs";
-  $self->free_ruleset_source($pms, 'head', $priority);
-
-  return unless ($evalstr);
-
-  $evalstr = <<"EOT";
-{
-    package $package_name;
-
-    $evalstr2
-
-    sub _head_tests_$clean_priority {
-        my (\$self) = \@_;
-        my \$hval;
-
-        $evalstr;
-    }
-
-    1;
+  );
 }
-EOT
 
-  eval $evalstr;
-
-  if ($@) {
-    warn "rules: failed to run header tests, skipping some: $@\n";
-    $pms->{rule_errors}++;
-  }
-  else {
-    my $method = "${package_name}::_head_tests_${clean_priority}";
-    push @TEMPORARY_METHODS, $method;
-    no strict "refs";
-    &{$method}($pms);
-    use strict "refs";
-  }
-}
+###########################################################################
 
 sub do_body_tests {
   my ($self, $pms, $priority, $textary) = @_;
-
-  # XXX - why not just make the plugin call directly?
-  return if $self->shortcircuited_p($pms);
-
-  dbg("rules: running body-text per-line regexp tests; score so far=".$pms->{score});
-
-  my $conf = $self->{conf};
-  my $doing_user_rules = 
-    $conf->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_BODY_TESTS};
-
-  # clean up priority value so it can be used in a subroutine name
-  my $clean_priority;
-  ($clean_priority = $priority) =~ s/-/neg/;
-
-  my $package_name = __PACKAGE__;
-
-  $pms->{test_log_msgs} = ();        # clear test state
-
-  if (defined &{"${package_name}::_body_tests_${clean_priority}"}
-       && !$doing_user_rules) {
-    no strict "refs";
-    &{"${package_name}::_body_tests_${clean_priority}"}($pms, @$textary);
-    use strict "refs";
-    return;
-  }
-
-  # caller can set this member of the Mail::SpamAssassin object to
-  # override this; useful for profiling rule runtimes, although I think
-  # the HitFreqsRuleTiming.pm plugin is probably better nowadays anyway
-  my $use_rule_subs = $self->{main}->{use_rule_subs};
-
-  # build up the eval string...
-  my $evalstr = $self->start_rules_plugin_code("body", $priority);
-  my $evalstr2 = '';
   my $loopid = 0;
 
-  $evalstr .= '
-
-        $self->{main}->call_plugins("run_body_hack", {
-                permsgstatus => $self, ruletype => "body",
-                priority => '.$priority.', lines => \@_
-              });
-
-  ';
-
-  while (my($rulename, $pat) = each %{$pms->{conf}->{body_tests}->{$priority}}) {
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_BODY_TESTS,
+    type => 'body',
+    testhash => $pms->{conf}->{body_tests},
+    args => [ @$textary ],
+    loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $pat, %opts) = @_;
     my $sub;
     my $sub_one_line;
 
     my $need_one_line = ($pms->{conf}->{generate_body_one_line_sub}->{$rulename});
 
-    if (($pms->{conf}->{tflags}->{$rulename}||'') =~ /\bmultiple\b/)
+    if (($conf->{tflags}->{$rulename}||'') =~ /\bmultiple\b/)
     {
       # support multiple matches
       $loopid++;
@@ -584,7 +562,7 @@
         '.$self->hash_line_for_rule($pms, $rulename).'
         while ($l =~ '.$pat.'g) { 
           $self->got_hit(q{'.$rulename.'}, "BODY: ", ruletype => "body"); 
-          '. $self->hit_rule_plugin_code($pms, $rulename, "body",
+          '. $self->hit_rule_plugin_code($pms, $rulename, 'body',
 					 "last body_".$loopid) . '
         }
       }
@@ -630,117 +608,65 @@
 
     }
 
-    if ($use_rule_subs) {
-      $evalstr .= '
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
           '.$rulename.'_body_test($self,@_); 
           '.$self->ran_rule_plugin_code($rulename, "body").'
         }
-      ';
+      ');
     }
     else {
-      $evalstr .= '
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
           '.$sub.'
           '.$self->ran_rule_plugin_code($rulename, "body").'
         }
-      ';
+      ');
     }
 
-    if ($doing_user_rules) {
-      next if (!$self->is_user_rule_sub ($rulename.'_body_test'));
-    }
+    next if ($opts{doing_user_rules} &&
+            !$self->is_user_rule_sub($rulename.'_body_test'));
 
-    if ($use_rule_subs) {
-      $evalstr2 .= '
-        sub '.$rulename.'_body_test { my $self = shift; '.$sub.' }
-      ';
-      push (@TEMPORARY_METHODS, $rulename.'_body_test');
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_temporary_method ($rulename.'_body_test',
+        '{ my $self = shift; '.$sub.' }');
     }
 
     if ($need_one_line) {
-      $evalstr2 .= '
-        sub '.$rulename.'_one_line_body_test { '.$sub_one_line.' }
-      ';
-      push (@TEMPORARY_METHODS, $rulename.'_one_line_body_test');
+      $self->add_temporary_method ($rulename.'_one_line_body_test',
+        '{ my $self = shift; '.$sub_one_line.' }');
     }
-
-  }
-
-  # clear out a previous version of this fn
-  undef &{"${package_name}::_body_tests_${clean_priority}"};
-  $self->free_ruleset_source($pms, 'body', $priority);
-
-  return unless ($evalstr);
-
-  # generate the loop that goes through each line...
-  $evalstr = <<"EOT";
-{
-  package $package_name;
-
-  $evalstr2
-
-  sub _body_tests_$clean_priority {
-    my \$self = shift;
-
-    $evalstr;
   }
+    pre_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
+    $self->add_evalstr ('
+ 
+      $self->{main}->call_plugins("run_body_hack", {
+              permsgstatus => $self, ruletype => "body",
+              priority => '.$opts{priority}.', lines => \@_
+            });
 
-  1;
+    ');
+  });
 }
-EOT
 
-  eval $evalstr;
-
-  if ($@) {
-    warn("rules: failed to compile body tests, skipping:\n" . "\t($@)\n");
-    $pms->{rule_errors}++;
-  }
-  else {
-    my $method = "${package_name}::_body_tests_${clean_priority}";
-    no strict "refs";
-    &{$method}($pms, @$textary);
-    use strict "refs";
-  }
-}
+###########################################################################
 
-sub do_body_uri_tests {
+sub do_uri_tests {
   my ($self, $pms, $priority, @uris) = @_;
-
-  # XXX - why not just do the direct plugin call?
-  return if $self->shortcircuited_p($pms);
-
-  dbg("uri: running uri tests; score so far=".$pms->{score});
-
-  my $doing_user_rules = 
-    $pms->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_URI_TESTS};
-
-  # clean up priority value so it can be used in a subroutine name
-  my $clean_priority;
-  ($clean_priority = $priority) =~ s/-/neg/;
-
-  my $package_name = __PACKAGE__;
-
-  $pms->{test_log_msgs} = ();        # clear test state
-
-  if (defined &{"${package_name}::_body_uri_tests_${clean_priority}"}
-      && !$doing_user_rules) {
-    no strict "refs";
-    &{"${package_name}::_body_uri_tests_${clean_priority}"}($pms, @uris);
-    use strict "refs";
-    return;
-  }
-
-  my $use_rule_subs = $self->{main}->{use_rule_subs};
-
-  # otherwise build up the eval string...
-  my $evalstr = $self->start_rules_plugin_code("uri", $priority);
-  my $evalstr2 = '';
   my $loopid = 0;
-
-  while (my($rulename, $pat) = each %{$pms->{conf}{uri_tests}->{$priority}}) {
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_URI_TESTS,
+    type => 'uri',
+    testhash => $pms->{conf}->{uri_tests},
+    args => [ @uris ],
+    loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $pat, %opts) = @_;
     my $sub;
-    if (($pms->{conf}->{tflags}->{$rulename}||'') =~ /\bmultiple\b/) {
+    if (($conf->{tflags}->{$rulename}||'') =~ /\bmultiple\b/) {
       $loopid++;
       $sub = '
       uri_'.$loopid.': foreach my $l (@_) {
@@ -765,109 +691,47 @@
       ';
     }
 
-    if ($use_rule_subs) {
-      # XXX - why isn't it _body_uri_test??
-      $evalstr .= '
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
           '.$rulename.'_uri_test($self, @_);
           '.$self->ran_rule_plugin_code($rulename, "uri").'
         }
-      ';
+      ');
     }
     else {
-      $evalstr .= '
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
           '.$sub.'
           '.$self->ran_rule_plugin_code($rulename, "uri").'
         }
-      ';
+      ');
     }
 
-    if ($doing_user_rules) {
-      next if (!$self->is_user_rule_sub($rulename.'_uri_test'));
-    }
+    next if ($opts{doing_user_rules} &&
+            !$self->is_user_rule_sub($rulename.'_uri_test'));
 
-    if ($use_rule_subs) {
-      # XXX - why isn't it _body_uri_test??
-      $evalstr2 .= '
-        sub '.$rulename.'_uri_test { my $self = shift; '.$sub.' }
-      ';
-      push (@TEMPORARY_METHODS, $rulename.'_uri_test');
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_temporary_method ($rulename.'_uri_test',
+        '{ my $self = shift; '.$sub.' }');
     }
   }
-
-  # clear out a previous version of this fn
-  undef &{"${package_name}::_body_uri_tests_${clean_priority}"};
-  $self->free_ruleset_source($pms, 'uri', $priority);
-
-  return unless ($evalstr);
-
-  # generate the loop that goes through each line...
-  $evalstr = <<"EOT";
-{
-  package $package_name;
-
-  $evalstr2
-
-  sub _body_uri_tests_$clean_priority {
-    my \$self = shift;
-    $evalstr;
-  }
-
-  1;
+  );
 }
-EOT
 
-  eval $evalstr;
-
-  if ($@) {
-    warn("rules: failed to compile URI tests, skipping:\n" . "\t($@)\n");
-    $pms->{rule_errors}++;
-  }
-  else {
-    my $method = "${package_name}::_body_uri_tests_${clean_priority}";
-    push @TEMPORARY_METHODS, $method;
-    no strict "refs";
-    &{$method}($pms, @uris);
-    use strict "refs";
-  }
-}
+###########################################################################
 
 sub do_rawbody_tests {
   my ($self, $pms, $priority, $textary) = @_;
-
-  # XXX - why not just do the plugin call here??
-  return if $self->shortcircuited_p($pms);
-
-  dbg("rules: running raw-body-text per-line regexp tests; score so far=".$pms->{score});
-
-  my $doing_user_rules = 
-    $pms->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS};
-
-  # clean up priority value so it can be used in a subroutine name
-  my $clean_priority;
-  ($clean_priority = $priority) =~ s/-/neg/;
-
-  my $package_name = __PACKAGE__;
-
-  $pms->{test_log_msgs} = ();        # clear test state
-  dbg("rules: in middle of raw-body-text");
-  if (defined &{"${package_name}::_rawbody_tests_${clean_priority}"}
-      && !$doing_user_rules) {
-    no strict "refs";
-    &{"${package_name}::_rawbody_tests_${clean_priority}"}($pms, @$textary);
-    use strict "refs";
-    return;
-  }
-
-  my $use_rule_subs = $self->{main}->{use_rule_subs};
-
-  # build up the eval string...
-  my $evalstr = $self->start_rules_plugin_code("rawbody", $priority);
-  my $evalstr2 = '';
   my $loopid = 0;
-
-  while (my($rulename, $pat) = each %{$pms->{conf}{rawbody_tests}->{$priority}}) {
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_RAWBODY_TESTS,
+    type => 'rawbody',
+    testhash => $pms->{conf}->{rawbody_tests},
+    args => [ @$textary ],
+    loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $pat, %opts) = @_;
     my $sub;
     if (($pms->{conf}->{tflags}->{$rulename}||'') =~ /\bmultiple\b/)
     {
@@ -897,104 +761,55 @@
       ';
     }
 
-    if ($use_rule_subs) {
-      $evalstr .= '
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
            '.$rulename.'_rawbody_test($self, @_);
            '.$self->ran_rule_plugin_code($rulename, "rawbody").'
         }
-      ';
+      ');
     }
     else {
-      $evalstr .= '
+      $self->add_evalstr ('
         if ($scoresptr->{q{'.$rulename.'}}) {
           '.$sub.'
           '.$self->ran_rule_plugin_code($rulename, "rawbody").'
         }
-      ';
+      ');
     }
 
-    if ($doing_user_rules) {
-      next if (!$self->is_user_rule_sub($rulename.'_rawbody_test'));
-    }
+    next if ($opts{doing_user_rules} &&
+            !$self->is_user_rule_sub($rulename.'_rawbody_test'));
 
-    if ($use_rule_subs) {
-      $evalstr2 .= '
-        sub '.$rulename.'_rawbody_test { my $self = shift; '.$sub.' }
-      ';
-      push (@TEMPORARY_METHODS, $rulename.'_rawbody_test');
+    if ($self->{main}->{use_rule_subs}) {
+      $self->add_temporary_method ($rulename.'_rawbody_test',
+        '{ my $self = shift; '.$sub.' }');
     }
   }
-
-  # clear out a previous version of this fn
-  undef &{"${package_name}::_rawbody_tests_${clean_priority}"};
-  $self->free_ruleset_source($pms, 'rawbody', $priority);
-
-  return unless ($evalstr);
-
-  # generate the loop that goes through each line...
-  $evalstr = <<"EOT";
-{
-  package $package_name;
-
-  $evalstr2
-
-  sub _rawbody_tests_$clean_priority {
-    my \$self = shift;
-    $evalstr;
-  }
-
-  1;
+  );
 }
-EOT
 
-  eval $evalstr;
-
-  if ($@) {
-    warn("rules: failed to compile body tests, skipping:\n" . "\t($@)\n");
-    $pms->{rule_errors}++;
-  }
-  else {
-    my $method = "${package_name}::_rawbody_tests_${clean_priority}";
-    push @TEMPORARY_METHODS, $method;
-    no strict "refs";
-    &{$method}($pms, @$textary);
-    use strict "refs";
-  }
-}
+###########################################################################
 
 sub do_full_tests {
   my ($self, $pms, $priority, $fullmsgref) = @_;
-
-  # XXX - why not just do the plugin call directly?
-  return if $self->shortcircuited_p($pms);
-
-  dbg("rules: running full-text regexp tests; score so far=".$pms->{score});
-
-  my $doing_user_rules = 
-    $pms->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_FULL_TESTS};
-
-  # clean up priority value so it can be used in a subroutine name
-  my $clean_priority;
-  ($clean_priority = $priority) =~ s/-/neg/;
-
-  my $package_name = __PACKAGE__;
-
-  $pms->{test_log_msgs} = ();        # clear test state
-
-  if (defined &{"${package_name}::_full_tests_${clean_priority}"}
-      && !$doing_user_rules) {
-    no strict "refs";
-    &{"${package_name}::_full_tests_${clean_priority}"}($pms, $fullmsgref);
-    use strict "refs";
-    return;
-  }
-
-  # build up the eval string...
-  my $evalstr = $self->start_rules_plugin_code("full", $priority);
-
-  while (my($rulename, $pat) = each %{$pms->{conf}{full_tests}->{$priority}}) {
-    $evalstr .= '
+  my $loopid = 0;
+  $self->run_generic_tests ($pms, $priority,
+    consttype => $Mail::SpamAssassin::Conf::TYPE_FULL_TESTS,
+    type => 'full',
+    testhash => $pms->{conf}->{full_tests},
+    args => [ $fullmsgref ],
+    pre_loop_body => sub
+  {
+    my ($self, $pms, $conf, %opts) = @_;
+    $self->add_evalstr ('
+      my $fullmsgref = shift;
+    ');
+  },
+                loop_body => sub
+  {
+    my ($self, $pms, $conf, $rulename, $pat, %opts) = @_;
+    $self->add_evalstr ('
       if ($scoresptr->{q{'.$rulename.'}}) {
         pos $$fullmsgref = 0;
         '.$self->hash_line_for_rule($pms, $rulename).'
@@ -1004,43 +819,13 @@
         }
         '.$self->ran_rule_plugin_code($rulename, "full").'
       }
-    ';
-  }
-
-  undef &{"${package_name}::_full_tests_${clean_priority}"};
-  $self->free_ruleset_source($pms, 'full', $priority);
-
-  return unless ($evalstr);
-
-  # and compile it.
-  $evalstr = <<"EOT";
-  {
-    package $package_name;
-
-    sub _full_tests_$clean_priority {
-        my (\$self, \$fullmsgref) = \@_;
-        study \$\$fullmsgref;
-        $evalstr
-    }
-
-    1;
-  }
-EOT
-
-  eval $evalstr;
-
-  if ($@) {
-    warn "rules: failed to compile full tests, skipping:\n" . "\t($@)\n";
-    $pms->{rule_errors}++;
-  } else {
-    my $method = "${package_name}::_full_tests_${clean_priority}";
-    push @TEMPORARY_METHODS, $method;
-    no strict "refs";
-    &{$method}($pms, $fullmsgref);
-    use strict "refs";
+    ');
   }
+  );
 }
 
+###########################################################################
+
 sub do_head_eval_tests {
   my ($self, $pms, $priority) = @_;
   return unless (defined($pms->{conf}->{head_evals}->{$priority}));
@@ -1075,8 +860,8 @@
 sub run_eval_tests {
   my ($self, $pms, $testtype, $evalhash, $prepend2desc, $priority, @extraevalargs) = @_;
   
-  # XXX - why not just call the plugin directly?
-  return if $self->shortcircuited_p($pms);
+  return if $self->{main}->call_plugins("have_shortcircuited",
+                                        { permsgstatus => $pms });
 
   my $conf = $pms->{conf};
   my $doing_user_rules = $conf->{user_rules_to_compile}->{$testtype};
@@ -1097,7 +882,8 @@
   # Some of the rules are scoreset specific, so we need additional
   # subroutines to handle those
   if (defined &{"${package_name}::${methodname}"}
-      && !$doing_user_rules) {
+      && !$doing_user_rules)
+  {
     no strict "refs";
     &{"${package_name}::${methodname}"}($pms,@extraevalargs);
     use strict "refs";
@@ -1246,15 +1032,9 @@
   }
 }
 
+###########################################################################
 # Helper Functions
 
-# NOTE: don't call this have_shortcircuited since it creates a nasty recursion loop
-sub shortcircuited_p {
-  my ($self, $pms) = @_;
-  return 1 if $self->{main}->call_plugins("have_shortcircuited", { permsgstatus => $pms
-								 });
-}
-
 sub hash_line_for_rule {
   my ($self, $pms, $rulename) = @_;
   return "\n".'#line 1 "'.
@@ -1274,7 +1054,7 @@
 
   my $evalstr = '
 
-      # start_rules_plugin_code '.$ruletype.'
+      # start_rules_plugin_code '.$ruletype.' '.$pri.'
       my $scoresptr = $self->{conf}->{scores};
 
   ';
@@ -1350,5 +1130,7 @@
     delete $pms->{conf}->{$type.'_tests'}->{$pri};
   }
 }
+
+###########################################################################
 
 1;

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/P595Body.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/P595Body.pm?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/P595Body.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/P595Body.pm Fri Nov 24 09:29:40 2006
@@ -68,15 +68,20 @@
   my ($self, $conf, $test_set, $ruletype) = @_;
   foreach my $pri (keys %{$test_set}) {
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;
-    $self->setup_test_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
+    $self->setup_test_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri, $pri);
   }
 }
 
 sub setup_test_set_pri {
-  my ($self, $conf, $rules, $ruletype) = @_;
+  my ($self, $conf, $rules, $ruletype, $pri) = @_;
 
   my $alternates = [];
   my $trie_rules = {};
+
+  # while (my ($rule, $pat) = each %{$pms->{conf}->{body_tests}->{$priority}}) {
+  # push @{$alternates}, $pat;
+  # }
+
   foreach my $base (keys %{$conf->{base_string}->{$ruletype}})
   {
     push @{$alternates}, $base;

Modified: spamassassin/branches/jm_re2c_hacks/masses/Makefile
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/Makefile?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/Makefile (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/Makefile Fri Nov 24 09:29:40 2006
@@ -18,8 +18,8 @@
 perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h
 	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
 
-tmp/rules.pl: tmp/.created parse-rules-for-masses
-	perl parse-rules-for-masses -d $(RULES) -s $(SCORESET)
+tmp/rules.pl: tmp/.created ../build/parse-rules-for-masses
+	perl ../build/parse-rules-for-masses -d $(RULES) -s $(SCORESET)
 
 tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c
 	perl logs-to-c --cffile=$(RULES) --scoreset=$(SCORESET)

Modified: spamassassin/branches/jm_re2c_hacks/masses/find-extremes
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/find-extremes?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/find-extremes (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/find-extremes Fri Nov 24 09:29:40 2006
@@ -348,7 +348,7 @@
 
 
 sub readscores {
-  system ("./parse-rules-for-masses") and
+  system ("../build/parse-rules-for-masses") and
    die "Couldn't do parse-rules-for-masses: $?; stopped";
   require "./tmp/rules.pl";
 }

Modified: spamassassin/branches/jm_re2c_hacks/masses/generate-translation
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/generate-translation?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/generate-translation (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/generate-translation Fri Nov 24 09:29:40 2006
@@ -82,7 +82,7 @@
 sub read_rules {
   my ($cffile) = @_;
 
-  system("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\"")
+  system("$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\"")
       and die "unable to parse rules\n";
   require "$FindBin::Bin/tmp/rules.pl"
       or die "unable to read tmp/rules.pl\n";

Modified: spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/hit-frequencies Fri Nov 24 09:29:40 2006
@@ -786,7 +786,7 @@
 
 sub readscores {
   my($cffile) = @_;
-  if (system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s")) {
+  if (system ("$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" -s $opt_s")) {
     warn "parse-rules-for-masses failed!";
   }
   eval {

Modified: spamassassin/branches/jm_re2c_hacks/masses/logs-to-c
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/logs-to-c?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/logs-to-c (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/logs-to-c Fri Nov 24 09:29:40 2006
@@ -200,7 +200,7 @@
 
 sub readscores {
   print "Reading scores from \"$opt_cffile\"...\n";
-  system ("./parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
+  system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
   require "./tmp/rules.pl";
   %allrules = %rules;           # ensure it stays global
 }

Modified: spamassassin/branches/jm_re2c_hacks/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/mass-check?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/mass-check (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/mass-check Fri Nov 24 09:29:40 2006
@@ -46,6 +46,7 @@
                 were encapsulated by servers matching the regexp RE
                 (default = extract all SpamAssassin-encapsulated mails)
   --lint        check rules for syntax before running
+  --cf='config line'  Additional line of configuration
 
   client/server mode options
   --server host:port
@@ -78,11 +79,16 @@
   -n            no date sorting or spam/ham interleaving
   --cache	use cache information when selecting messages
   --cachedir=dir write cache info for --cache in this directory tree
+  --all         don't skip big messages
+
+  message selection options, can be specified for each target
   --after=N     only test mails received after time_t N (negative values
                 are an offset from current time, e.g. -86400 = last day)
                 or after date as parsed by Time::ParseDate (e.g. '-6 months')
   --before=N    same as --after, except received times are before time_t N
-  --all         don't skip big messages
+  --scanprob=N  probability of scanning a message, range 0.0 - 1.0 (default: 1.0)
+
+  message selection options, can be specified for each target class
   --head=N      only check first N ham and N spam (N messages if -n used)
   --tail=N      only check last N ham and N spam (N messages if -n used)
 
@@ -117,8 +123,8 @@
 	    $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots
 	    $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris
 	    $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
-	    $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy
-	    $total_messages $statusevery $opt_cachedir
+	    $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf
+	    $total_messages $statusevery $opt_cachedir $opt_scanprob
 	    $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only
 	    $opt_server %postdata %real $svn_revision
 	    $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);
@@ -148,16 +154,20 @@
 $opt_spamlog = "spam.log";
 $opt_learn = 0;
 $reuse_rules_loaded_p = 0;
+$opt_cf = [];
 
 my @ORIG_ARGV = @ARGV;
 GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s",
 	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
 	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
-	   "rules=s", "restart=i", "after=s", "before=s", "loguris",
+	   "rules=s", "restart=i", "loguris",
 	   "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
-           "cachedir=s", "noisy",
+           "cachedir=s", "noisy", "scanprob=f",
 	   "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only",
 	   "client=s",
+	   "before=s" => \&deal_with_before_after,
+	   "after=s" => \&deal_with_before_after,
+           'cf=s' => \@{$opt_cf},
 	   "dir" => sub { $opt_format = "dir"; },
 	   "file" => sub { $opt_format = "file"; },
 	   "mbox" => sub { $opt_format = "mbox"; },
@@ -177,7 +187,7 @@
     # some people specify paths relatively, whereas this needs an absolute path,
     # so "do the right thing"(tm).
     my $abs_opt_c = File::Spec->rel2abs($opt_c);
-    system("cd $FindBin::Bin; perl parse-rules-for-masses -d $abs_opt_c");
+    system("cd $FindBin::Bin; perl ../build/parse-rules-for-masses -d $abs_opt_c");
   }
 
   require $rules_path;
@@ -207,6 +217,7 @@
     'local_tests_only'   			=> $opt_net ? 0 : 1,
     'only_these_rules'   			=> $opt_rules,
     'ignore_safety_expire_timeout'		=> 1,
+    'post_config_text'                          => join("\n", @{$opt_cf})."\n",
     PREFIX					=> '',
     DEF_RULES_DIR        			=> $opt_c,
     LOCAL_RULES_DIR      			=> '',
@@ -241,6 +252,7 @@
   'local_tests_only'   			=> $opt_net ? 0 : 1,
   'only_these_rules'   			=> $opt_rules,
   'ignore_safety_expire_timeout'	=> 1,
+  'post_config_text'                    => join("\n", @{$opt_cf})."\n",
   PREFIX				=> '',
   DEF_RULES_DIR        			=> $opt_c,
   LOCAL_RULES_DIR      			=> '',
@@ -308,34 +320,21 @@
     open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!";
   }
 
-  # Deal with --before and --after
-  foreach my $time ($opt_before, $opt_after) {
-    if ($time && $time =~ /^-\d+$/) {
-      $time = time + $time;
-    }
-    elsif ($time && $time !~ /^-?\d+$/) {
-      if (HAS_TIME_PARSEDATE) {
-        $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
-      }
-      else { 
-        die "You need Time::ParseDate if you use either the --before or --after option.";
-      }
-    }
-  }
-
-  if ($opt_before && $opt_after && $opt_after >= $opt_before) {
-    die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
-  }
-
   # ArchiveIterator options for non-client mode
   $AIopts->{'opt_n'} = $opt_n;
   $AIopts->{'opt_head'} = $opt_head;
   $AIopts->{'opt_tail'} = $opt_tail;
+  $AIopts->{'opt_scanprob'} = $opt_scanprob;
   $AIopts->{'opt_cache'} = $opt_cache;
   $AIopts->{'opt_cachedir'} = $opt_cachedir;
   $AIopts->{'opt_after'} = $opt_after;
   $AIopts->{'opt_before'} = $opt_before;
   $AIopts->{'scan_progress_sub'} = \&showdots_blip;
+
+  # ensure that scanprob stuff is predictable and reproducible
+  if (defined $opt_scanprob && $opt_scanprob < 1.0) {
+    srand(1);
+  }
 }
 else {
   # ArchiveIterator options for client mode -- tends to be simple
@@ -447,12 +446,23 @@
 
 sub target  {
   my ($target) = @_;
+
+  # message-selection options; these can now be specified separately
+  # for each target
+  my %selopts = (
+    opt_head => $opt_head,
+    opt_tail => $opt_tail,
+    opt_scanprob => $opt_scanprob,
+    opt_after => $opt_after,
+    opt_before => $opt_before
+  );
+
   if (!defined($opt_format)) {
-    push(@targets, $target);
+    push(@targets, { %selopts, target => $target });
   }
   else {
     $opt_o = 1;
-    push(@targets, "spam:$opt_format:$target");
+    push(@targets, { %selopts, target => "spam:$opt_format:$target" });
   }
 }
 
@@ -736,6 +746,8 @@
     }
   }
 
+# use Mail::SpamAssassin::Util::MemoryDump; Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG(); use Mail::SpamAssassin::Util::MemoryDump; Mail::SpamAssassin::Util::MemoryDump::MEMDEBUG_dump_obj($status); #JMD
+
   if (defined $status) { $status->finish(); }
   $ma->finish();
   undef $ma;		# clean 'em up
@@ -1828,5 +1840,32 @@
 sub aidbg {
   if (would_log("dbg", "mass-check") == 2) {
     dbg (@_);
+  }
+}
+
+sub deal_with_before_after {
+  my($which, $time) = @_;
+
+  if ($time && $time =~ /^-\d+$/) {
+    $time = time + $time;
+  }
+  elsif ($time && $time !~ /^-?\d+$/) {
+    if (HAS_TIME_PARSEDATE) {
+      $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
+    }
+    else { 
+      die "You need Time::ParseDate if you use either the --before or --after option.";
+    }
+  }
+  
+  if ($which eq 'before') {
+    $opt_before = $time;
+  }
+  else {
+    $opt_after = $time;
+  }
+
+  if ($opt_before && $opt_after && $opt_after >= $opt_before) {
+    die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
   }
 }

Modified: spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/mk-roc-graphs Fri Nov 24 09:29:40 2006
@@ -122,7 +122,7 @@
 
 sub readscores {
   warn "Reading scores from \"$opt_cffile\"...\n";
-  system ("./parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
+  system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
   require "./tmp/rules.pl";
   %allrules = %rules;           # ensure it stays global
 }

Modified: spamassassin/branches/jm_re2c_hacks/masses/rewrite-cf-with-new-scores
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/masses/rewrite-cf-with-new-scores?view=diff&rev=478926&r1=478925&r2=478926
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/masses/rewrite-cf-with-new-scores (original)
+++ spamassassin/branches/jm_re2c_hacks/masses/rewrite-cf-with-new-scores Fri Nov 24 09:29:40 2006
@@ -81,7 +81,7 @@
 
 
 sub read_rules {
-  system ("./parse-rules-for-masses -s $scoreset") and die;
+  system ("../build/parse-rules-for-masses -s $scoreset") and die;
   if (-e "tmp/rules.pl") {
     # note: the spaces need to stay in front of the require to work around
     # a RPM 4.1 problem