You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/09/29 20:21:11 UTC

svn commit: r451377 - in /spamassassin/branches/jm_re2c_hacks: lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm rule2xs/re2xs rules/rule2xs.pre

Author: jm
Date: Fri Sep 29 11:21:10 2006
New Revision: 451377

URL: http://svn.apache.org/viewvc?view=rev&rev=451377
Log:
separate 'base' extraction into a separate plugin, since it can work with other systems as well as rule2xs

Added:
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
Modified:
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
    spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs
    spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre

Added: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=auto&rev=451377
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (added)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Sep 29 11:21:10 2006
@@ -0,0 +1,505 @@
+# <@LICENSE>
+# Copyright 2006 Apache Software Foundation
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
+
+=head1 SYNOPSIS
+
+This is a work-in-progress plugin to extract "base" strings from SpamAssassin
+'body' rules, suitable for use in Rule2XsBody rules.
+
+=cut
+
+package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# a few settings that control what kind of bases are output:
+
+# treat all rules as lowercase for purposes of term extraction?
+my $BASES_MUST_BE_CASE_I = 1;
+my $BASES_CAN_USE_ALTERNATIONS = 0;    # /(foo|bar|baz)/
+my $BASES_CAN_USE_QUANTIFIERS = 0;     # /foo.*bar/ or /foo*bar/ or /foooo?bar/
+my $BASES_CAN_USE_CHAR_CLASSES = 0;    # /fo[opqr]bar/
+
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  return $self;
+}
+
+###########################################################################
+
+sub finish_parsing_end {
+  my ($self, $params) = @_;
+  my $conf = $params->{conf};
+
+  # TODO: need a better way to do this rather than using an env
+  # var as a back channel
+  my $rawf = $ENV{'RULE_REGEXP_DUMP_FILE'};
+  return unless $rawf;
+
+  $rawf =~ /^(.*)$/;
+  my $f = $1;       # untaint; allow anything here, it's from %ENV and safe
+
+  $self->extract_bases_for_set ($f, $conf, $conf->{body_tests}, 'body');
+}
+
+sub extract_bases_for_set {
+  my ($self, $dumpfile, $conf, $test_set, $ruletype) = @_;
+
+  foreach my $pri (keys %{$test_set}) {
+    my $nicepri = $pri; $nicepri =~ s/-/neg/g;
+    $self->extract_all($dumpfile, $conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
+  }
+}
+
+###########################################################################
+
+sub extract_all {
+  my ($self, $dumpfile, $conf, $rules, $ruletype) = @_;
+
+  my @good_bases = ();
+  my @failed = ();
+  my $yes = 0;
+  my $no = 0;
+
+  dbg("zoom: base extraction start for type $ruletype");
+
+  # attempt to find good "base strings" (simplified regexp subsets) for each
+  # regexp.  We try looking at the regexp from both ends, since there
+  # may be a good long string of text at the end of the rule.
+
+  # require this many chars in a base string, for it to be viable
+  my $min_chars = 4;
+
+  foreach my $name (keys %{$rules}) {
+    my $rule = $rules->{$name};
+
+    # ignore ReplaceTags rules
+    next if ($conf->{rules_to_replace}->{$name});
+
+    my $base  = $self->extract_base($rule, 0);
+    my $base2 = $self->extract_base($rule, 1);
+
+    my $len   = $base  ? $self->count_regexp_statements($base) : 0;
+    my $len2  = $base2 ? $self->count_regexp_statements($base2) : 0;
+
+    if ($base2 && (!$base || ($len2 > $len))) {
+      $base = $base2;
+      $len = $len2;
+    }
+
+    if (!$base || $len < $min_chars) { $base = undef; }
+
+    if ($base) {
+      # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
+      push @good_bases, { base => $base, orig => $rule, name => $name };
+      $yes++;
+    }
+    else {
+      dbg("zoom: NO $rule");
+      push @failed, { orig => $rule };
+      $no++;
+    }
+  }
+
+  # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
+  # ("food" =~ "foo" / "food") will return "food".  So therefore if a pattern
+  # subsumes other patterns, we need to return hits for all of them.  We also
+  # need to take care of the case where multiple regexps wind up sharing the
+  # same base.   
+  #
+  # Another gotcha, an exception to the subsumption rule; if one pattern isn't
+  # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
+  # returned as two hits, correctly.  So we only have to be smart about the
+  # full-subsumption case; overlapping is taken care of for us, by re2c.
+  #
+  # TODO: there's a bug here.  Since the code in extract_base() has been
+  # modified to support more complex regexps, we can no longer simply assume
+  # that if pattern A is not contained in pattern B, that means that pattern B
+  # doesn't subsume it.  Consider, for example, A="foo*bar" and
+  # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
+  # that without running the A RE match itself somehow against B.
+  # same issue remains with:
+  #
+  #   "foo?bar" / "fobar"
+  #   "fo(?:o|oo|)bar" / "fobar"
+  #   "fo(?:o|oo)?bar" / "fobar"
+  #   "fo(?:o*|baz)bar" / "fobar"
+  #   "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
+  #
+  # it's worse with this:
+  #
+  #   "fo(?:o|oo|)bar" / "foo*bar"
+  #
+  # basically, this is impossible to compute without reimplementing most of
+  # re2c, and it appears the re2c developers don't plan to offer this:
+  # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
+
+  open (OUT, ">$dumpfile") or die "cannot write to $dumpfile!";
+  print OUT "name $ruletype\n";
+
+  foreach my $set1 (@good_bases) {
+    my $base1 = $set1->{base};
+    my $orig1 = $set1->{orig};
+    my $key1  = $set1->{name};
+    next if ($base1 eq '' or $key1 eq '');
+
+    print OUT "orig $key1 $orig1\n";
+
+    foreach my $set2 (@good_bases) {
+      next if ($set1 == $set2);
+      next if ($set1->{name} =~ /\b\Q$set2->{name}\E\b/);
+      next if ($set2->{name} =~ /\b\Q$set1->{name}\E\b/);
+
+      my $base2 = $set2->{base};
+      next if ($base2 eq '');
+      next if (length $base1 < length $base2);
+      next if ($base1 !~ /\Q$base2\E/);
+
+      $set1->{name} .= " ".$set2->{name};
+
+      if ($base1 eq $base2) {
+        # an exact duplicate!  kill the latter entirely
+        $set2->{name} = '';
+        $set2->{base} = '';
+      }
+      # otherwise, base2 is just a subset of base1
+
+      # dbg("zoom: subsuming '$base2' into '$base1': $set1->{name}");
+    }
+  }
+
+  foreach my $set (@good_bases) {
+    my $base = $set->{base};
+    my $key  = $set->{name};
+    next unless $base;
+    print OUT "r $base:$key\n";
+  }
+  close OUT or die "close failed on $dumpfile!";
+
+  # TODO: run re2xs automatically here
+
+  warn ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
+}
+
+###########################################################################
+
+# TODO:
+# NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
+#     => should extract 'scription' somehow
+# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
+#     => should understand alternations; tricky
+
+sub extract_base {
+  my $self = shift;
+  my $rule = shift;
+  my $is_reversed = shift;
+
+  my $orig = $rule;
+  $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
+
+  # remove the regexp modifiers, keep for later
+  my $mods = '';
+  while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }
+
+  # modifier removal
+  while ($rule =~ s/^\(\?-([a-z]*)\)//) {
+    foreach my $modchar (split '', $mods) {
+      $mods =~ s/$modchar//g;
+    }
+  }
+
+  # now: simplify aspects of the regexp.  Bear in mind that we can
+  # simplify as long as we cause the regexp to become more general;
+  # more hits is OK, since false positives will be discarded afterwards
+  # anyway.  Simplification that causes the regexp to *not* hit
+  # stuff that the "real" rule would hit, however, is a bad thing.
+
+  if ($BASES_MUST_BE_CASE_I) {
+    $rule = lc $rule;
+    $mods =~ s/i//;
+
+    # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
+    $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs;
+
+    # always case-i: /A(?-i:ct)/ => /Act/
+    $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
+
+    # remove (?i)
+    $rule =~ s/\(\?i\)//gs;
+  }
+
+  # remove /m and /s modifiers
+  $mods =~ s/m//;
+  $mods =~ s/s//;
+
+  # remove (^|\b)'s
+  # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
+  $rule =~ s/\(\^\|\\b\)//gs;
+  $rule =~ s/\(\$\|\\b\)//gs;
+  $rule =~ s/\(\\b\|\^\)//gs;
+  $rule =~ s/\(\\b\|\$\)//gs;
+
+  # remove \b's
+  $rule =~ s/\\b//gs;
+
+  # remove the "?=" trick
+  # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
+  $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
+
+  # if there are anchors, give up; we can't get much 
+  # faster than these anyway
+  return if $rule =~ /^\(?(?:\^|\\A)/;
+  return if $rule =~ /(?:\$|\\Z)\)?$/;
+
+  # simplify (?:..) to (..)
+  $rule =~ s/\(\?:/\(/g;
+
+  # here's the trick; we can use the truncate regexp below simply by
+  # reversing the string and taking care to fix "\z" 2-char escapes.
+  # TODO: this breaks stuff like "\s+" or "\S{4,12}", but since the
+  # truncation regexp below is pretty simple-minded, that's ok.
+  if ($is_reversed) {
+    $rule = join ('', reverse (split '', $rule));
+    $rule = de_reverse_multi_char_regexp_statements($rule);
+  }
+
+  # truncate the pattern at the first unhandleable metacharacter
+  # or range
+  $rule =~ s/(?<!\\)(?:
+              \(\?\!|
+              \\[abce-rt-vx-z]|
+              \\[ABCE-RT-VX-Z]
+            ).*$//gsx;
+
+  $BASES_CAN_USE_CHAR_CLASSES or $rule =~ s/(?<!\\)(?:
+              \\\w|
+              \.|
+              \[|
+              \]
+            ).*$//gsx;
+
+  $BASES_CAN_USE_QUANTIFIERS or $rule =~ s/(?<!\\)(?:
+              .\*|	# remove the quantified char, too
+              .\+|
+              .\?|
+              .\{
+            ).*$//gsx;
+
+  $BASES_CAN_USE_ALTERNATIONS or $rule =~ s/(?<!\\)(?:
+              \(|
+              \)
+            ).*$//gsx;
+
+  if ($is_reversed) {
+    $rule = join ('', reverse (split '', $rule));
+    $rule = de_reverse_multi_char_regexp_statements($rule);
+  }
+
+  # drop this one, after the reversing
+  $rule =~ s/\(\?\!.*$//gsx;
+
+  # still problematic; kill all "x?" statements
+  $rule =~ s/.\?.*$//gsx;
+
+  # simplify (..)? and (..|) to (..|z{0})
+  # this wierd construct is to work around an re2c bug; (..|) doesn't
+  # do what it should
+  if ($BASES_CAN_USE_ALTERNATIONS) {
+    $rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
+    $rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
+    $rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
+  }
+
+  # re2xs doesn't like escaped brackets;
+  # brackets in general, in fact
+  $rule =~ s/\:.*//g;
+
+  # replace \s, \d, \S with char classes that (nearly) match them
+  # TODO: \w, \W need to know about utf-8, ugh
+
+  # [a-f\s]
+  $rule =~ s/(\[[^\]]*)\\s([^\]]*\])/$1 \\t\\n$2/gs;
+  # [a-f\S]: we can't support this, cut the string here
+  $rule =~ s/(\[[^\]]*)\\S([^\]]*\]).*//gs;
+  $rule =~ s/(\[[^\]]*)\\d([^\]]*\])/${1}0-9$2/gs;
+  $rule =~ s/(\[[^\]]*)\\D([^\]]*\]).*//gs;
+  $rule =~ s/(\[[^\]]*)\\w([^\]]*\])/${1}a-z0-9$2/gs;
+  $rule =~ s/(\[[^\]]*)\\W([^\]]*\]).*//gs;
+
+  # \s, etc. outside of existing char class blocks
+  $rule =~ s/\\S/[^ \\t\\n]/gs;
+  $rule =~ s/\\s/[ \\t\\n]/gs;
+  $rule =~ s/\\S/[^ \\t\\n]/gs;
+  $rule =~ s/\\d/[0-9]/gs;
+  $rule =~ s/\\D/[^0-9]/gs;
+  $rule =~ s/\\w/[_a-z0-9]/gs;
+  $rule =~ s/\\W/[^_a-z0-9]/gs;
+
+  # loop here, to catch __DRUGS_SLEEP1:
+  # 0,3}([ \t\n]|z{0})
+  while (1) 
+  {
+    my $startrule = $rule;
+
+    # exit early if the pattern starts with a class in a group;
+    # we can't reliably kill these
+    # r ([a-z0-9]+\*[,[ \t\n]]+){2}:TVD_BODY_END_STAR
+    if ($rule =~ /^\((?:
+              \.?[\*\?\+] |
+              \.?\{?[^\{]*\} |
+              [^\(]*\) |
+              \[ |
+              [^\[]*\]
+            )/sx)
+    {
+      return;
+    }
+
+    # kill quantifiers right at the start of the string.
+    # this (a) reduces algorithmic complexity of the produced code,
+    # and (b) can also improve overall speed as a side-effect of (a)
+    $rule =~ s/^(?:
+              \.?[\*\?\+] |
+              \.?\{?[^\{]*\} |
+              [^\(]*\) |
+              \[?[^\[]*\]
+            )+//gsx;
+
+    # kill quantifiers right at the end of the string, too;
+    # they can hide hits if they overlap with other patterns
+    0 and $rule =~ s/(?:
+              \.[\*\?\+] |
+              \.\{?[^\{]*\} |
+              \. |
+              \([^\)]* |
+              \[[^\[]*\]?
+            )+$//gsx;
+
+    last if $startrule eq $rule;
+  }
+
+
+  # return for things we know we can't handle.
+  if (!$BASES_CAN_USE_ALTERNATIONS) {
+    if ($rule =~ /\|/) {
+      # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
+      return;
+    }
+  }
+
+  {
+    # count (...braces...) to ensure the numbers match up
+    my @c = ($rule =~ /(?<!\\)\(/g); my $brace_i = scalar @c;
+       @c = ($rule =~ /(?<!\\)\)/g); my $brace_o = scalar @c;
+    if ($brace_i != $brace_o) { return; }
+  }
+
+  # do the same for [charclasses]
+  {
+    my @c = ($rule =~ /(?<!\\)\[/g); my $brace_i = scalar @c;
+       @c = ($rule =~ /(?<!\\)\]/g); my $brace_o = scalar @c;
+    if ($brace_i != $brace_o) { return; }
+  }
+
+  # and {quantifiers}
+  {
+    my @c = ($rule =~ /(?<!\\)\{/g); my $brace_i = scalar @c;
+       @c = ($rule =~ /(?<!\\)\}/g); my $brace_o = scalar @c;
+    if ($brace_i != $brace_o) { return; }
+  }
+
+  # lookaheads that are just too far for the re2c parser
+  # r your .{0,40}account .{0,40}security
+  if ($rule =~ /\.\{(\d+),?(\d+?)\}/ and ($1+$2 > 20)) {
+    return;
+  }
+
+  # re2xs doesn't like escaped brackets
+  if ($rule =~ /\\:/) {
+    return;
+  }
+
+  # finally, reassemble a usable regexp
+  if ($mods ne '') {
+    $mods = "(?$mods)";
+  }
+  $rule = $mods . $rule;
+
+  return $rule;
+}
+
+sub count_regexp_statements {
+  my $self = shift;
+  my $rule = shift;
+
+  # collapse various common metachar sequences into 1 char,
+  # or their shortest form
+  $rule =~ s/(?<!\\)(?:
+            \[.+?\][\?\*]|
+            \{0\}\?|
+            \{.+?\}\?
+          )//gs;
+
+  $rule =~ s/\[.+?\]/R/gs;
+  $rule =~ s/\{.+?\}/Q/gs;
+  $rule =~ s/.\?//gs;
+  $rule =~ s/.\*//gs;
+
+  return length $rule;
+}
+
+sub de_reverse_multi_char_regexp_statements {
+  my $rule = shift;
+
+  # fix:
+  #    "S\" => "\S"
+  #    "+S\" => "\S+"
+  #    "}41,2{S\" => "\S{2,14}"
+  #    "?}41,2{S\" => "\S{2,14}?"
+
+  $rule =~ s/
+        (
+          \? |
+        )
+        (
+          \}(?:\d*\,)?\d*\{ |
+          \* |
+          \+ |
+          \? |
+        )
+        (.)(\\?)/$4$3$2$1/gsx;
+
+  return $rule;
+}
+
+1;

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm?view=diff&rev=451377&r1=451376&r2=451377
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm Fri Sep 29 11:21:10 2006
@@ -33,47 +33,22 @@
   my $self = $class->SUPER::new($mailsaobject);
   bless ($self, $class);
 
-  $self->set_config($mailsaobject->{conf});
-
   return $self;
 }
 
-sub set_config {
-  my ($self, $conf) = @_;
-  my @cmds = ();
-
-  # push (@cmds, {
-  # setting => 'whitelist_from',
-  # type => $Mail::SpamAssassin::Conf::CONF_TYPE_ADDRLIST
-  # });
-
-  $conf->{parser}->register_commands(\@cmds);
-}
-
 ###########################################################################
 
 sub finish_parsing_end {
   my ($self, $params) = @_;
   my $conf = $params->{conf};
-
-  # TODO: need a better way to do this rather than using an env
-  # var as a back channel
-  if ($ENV{'RULE_REGEXP_DUMP_FILE'}) {
-    $conf->{env_regexp_dump_file} = $ENV{'RULE_REGEXP_DUMP_FILE'};
-  }
-
-  $self->zoomify_test_set ($conf, $conf->{body_tests}, 'body');
+  $self->run_test_set ($conf, $conf->{body_tests}, 'body');
 }
 
-sub zoomify_test_set {
+sub run_test_set {
   my ($self, $conf, $test_set, $ruletype) = @_;
   foreach my $pri (keys %{$test_set}) {
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;
     $self->check_all($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
-
-    if ($conf->{env_regexp_dump_file}) {
-      $self->extract_all($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
-    }
   }
 }
 
@@ -179,435 +154,6 @@
   }
 
   dbg("zoom: run_body_hack for $ruletype done");
-}
-
-###########################################################################
-
-sub extract_all {
-  my ($self, $conf, $rules, $ruletype) = @_;
-  my @good_bases = ();
-  my @failed = ();
-  my $yes = 0;
-  my $no = 0;
-
-  dbg("zoom: base extraction start for type $ruletype");
-
-  $conf->{env_regexp_dump_file} =~ /^(.*)$/;
-  my $dumpfile = $1;
-
-  # attempt to find good "base strings" (simplified regexp subsets) for each
-  # regexp.  We try looking at the regexp from both ends, since there
-  # may be a good long string of text at the end of the rule.
-
-  # require this many chars in a base string, for it to be viable
-  my $min_chars = 4;
-
-  foreach my $name (keys %{$rules}) {
-    my $rule = $rules->{$name};
-
-    # ignore ReplaceTags rules
-    next if ($conf->{rules_to_replace}->{$name});
-
-    my $base  = $self->extract_base($rule, 0);
-    my $base2 = $self->extract_base($rule, 1);
-
-    my $len   = $base  ? $self->count_regexp_statements($base) : 0;
-    my $len2  = $base2 ? $self->count_regexp_statements($base2) : 0;
-
-    if ($base2 && (!$base || ($len2 > $len))) {
-      $base = $base2;
-      $len = $len2;
-    }
-
-    if (!$base || $len < $min_chars) { $base = undef; }
-
-    if ($base) {
-      # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
-      push @good_bases, { base => $base, orig => $rule, name => $name };
-      $yes++;
-    }
-    else {
-      dbg("zoom: NO $rule");
-      push @failed, { orig => $rule };
-      $no++;
-    }
-  }
-
-  # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
-  # ("food" =~ "foo" / "food") will return "food".  So therefore if a pattern
-  # subsumes other patterns, we need to return hits for all of them.  We also
-  # need to take care of the case where multiple regexps wind up sharing the
-  # same base.   
-  #
-  # Another gotcha, an exception to the subsumption rule; if one pattern isn't
-  # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
-  # returned as two hits, correctly.  So we only have to be smart about the
-  # full-subsumption case; overlapping is taken care of for us, by re2c.
-  #
-  # TODO: there's a bug here.  Since the code in extract_base() has been
-  # modified to support more complex regexps, we can no longer simply assume
-  # that if pattern A is not contained in pattern B, that means that pattern B
-  # doesn't subsume it.  Consider, for example, A="foo*bar" and
-  # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
-  # that without running the A RE match itself somehow against B.
-  # same issue remains with:
-  #
-  #   "foo?bar" / "fobar"
-  #   "fo(?:o|oo|)bar" / "fobar"
-  #   "fo(?:o|oo)?bar" / "fobar"
-  #   "fo(?:o*|baz)bar" / "fobar"
-  #   "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
-  #
-  # it's worse with this:
-  #
-  #   "fo(?:o|oo|)bar" / "foo*bar"
-  #
-  # basically, this is impossible to compute without reimplementing most of
-  # re2c, and it appears the re2c developers don't plan to offer this:
-  # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
-
-  open (OUT, ">$dumpfile") or die "cannot write to $dumpfile!";
-  print OUT "name $ruletype\n";
-
-  foreach my $set1 (@good_bases) {
-    my $base1 = $set1->{base};
-    my $orig1 = $set1->{orig};
-    my $key1  = $set1->{name};
-    next if ($base1 eq '' or $key1 eq '');
-
-    print OUT "orig $key1 $orig1\n";
-
-    foreach my $set2 (@good_bases) {
-      next if ($set1 == $set2);
-      next if ($set1->{name} =~ /\b\Q$set2->{name}\E\b/);
-      next if ($set2->{name} =~ /\b\Q$set1->{name}\E\b/);
-
-      my $base2 = $set2->{base};
-      next if ($base2 eq '');
-      next if (length $base1 < length $base2);
-      next if ($base1 !~ /\Q$base2\E/);
-
-      $set1->{name} .= " ".$set2->{name};
-
-      if ($base1 eq $base2) {
-        # an exact duplicate!  kill the latter entirely
-        $set2->{name} = '';
-        $set2->{base} = '';
-      }
-      # otherwise, base2 is just a subset of base1
-
-      # dbg("zoom: subsuming '$base2' into '$base1': $set1->{name}");
-    }
-  }
-
-  foreach my $set (@good_bases) {
-    my $base = $set->{base};
-    my $key  = $set->{name};
-    next unless $base;
-    print OUT "r $base:$key\n";
-  }
-  close OUT or die "close failed on $dumpfile!";
-
-  # TODO: run re2xs automatically here
-
-  warn ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
-}
-
-###########################################################################
-
-# TODO:
-# NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
-#     => should extract 'scription' somehow
-# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
-#     => should understand alternations; tricky
-
-sub extract_base {
-  my $self = shift;
-  my $rule = shift;
-  my $is_reversed = shift;
-
-  my $orig = $rule;
-  $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
-
-  # remove the regexp modifiers, keep for later
-  my $mods = '';
-  while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }
-
-  # modifier removal
-  while ($rule =~ s/^\(\?-([a-z]*)\)//) {
-    foreach my $modchar (split '', $mods) {
-      $mods =~ s/$modchar//g;
-    }
-  }
-
-  # now: simplify aspects of the regexp.  Bear in mind that we can
-  # simplify as long as we cause the regexp to become more general;
-  # more hits is OK, since false positives will be discarded afterwards
-  # anyway.  Simplification that causes the regexp to *not* hit
-  # stuff that the "real" rule would hit, however, is a bad thing.
-
-  # treat all rules as lowercase for purposes of term extraction?
-  my $output_casei = 1;
-  if ($output_casei) {
-    $rule = lc $rule;
-    $mods =~ s/i//;
-
-    # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
-    $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs;
-
-    # always case-i: /A(?-i:ct)/ => /Act/
-    $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
-
-    # remove (?i)
-    $rule =~ s/\(\?i\)//gs;
-  }
-
-  # remove /m and /s modifiers
-  $mods =~ s/m//;
-  $mods =~ s/s//;
-
-  # remove (^|\b)'s
-  # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
-  $rule =~ s/\(\^\|\\b\)//gs;
-  $rule =~ s/\(\$\|\\b\)//gs;
-  $rule =~ s/\(\\b\|\^\)//gs;
-  $rule =~ s/\(\\b\|\$\)//gs;
-
-  # remove \b's
-  $rule =~ s/\\b//gs;
-
-  # remove the "?=" trick
-  # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
-  $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
-
-  # if there are anchors, give up; we can't get much 
-  # faster than these anyway
-  return if $rule =~ /^\(?(?:\^|\\A)/;
-  return if $rule =~ /(?:\$|\\Z)\)?$/;
-
-  # simplify (?:..) to (..)
-  $rule =~ s/\(\?:/\(/g;
-
-  # here's the trick; we can use the truncate regexp below simply by
-  # reversing the string and taking care to fix "\z" 2-char escapes.
-  # TODO: this breaks stuff like "\s+" or "\S{4,12}", but since the
-  # truncation regexp below is pretty simple-minded, that's ok.
-  if ($is_reversed) {
-    $rule = join ('', reverse (split '', $rule));
-    $rule = de_reverse_multi_char_regexp_statements($rule);
-  }
-
-  my $keep_alternations = 0;    # /(foo|bar|baz)/
-  my $keep_quantifiers = 0;     # /foo.*bar/ or /foo*bar/ or /foooo?bar/
-  my $keep_classes = 0;         # /fo[opqr]bar/
-
-  # truncate the pattern at the first unhandleable metacharacter
-  # or range
-  $rule =~ s/(?<!\\)(?:
-              \(\?\!|
-              \\[abce-rt-vx-z]|
-              \\[ABCE-RT-VX-Z]
-            ).*$//gsx;
-
-  $keep_classes or $rule =~ s/(?<!\\)(?:
-              \\\w|
-              \.|
-              \[|
-              \]
-            ).*$//gsx;
-
-  $keep_quantifiers or $rule =~ s/(?<!\\)(?:
-              .\*|	# remove the quantified char, too
-              .\+|
-              .\?|
-              .\{
-            ).*$//gsx;
-
-  $keep_alternations or $rule =~ s/(?<!\\)(?:
-              \(|
-              \)
-            ).*$//gsx;
-
-  if ($is_reversed) {
-    $rule = join ('', reverse (split '', $rule));
-    $rule = de_reverse_multi_char_regexp_statements($rule);
-  }
-
-  # drop this one, after the reversing
-  $rule =~ s/\(\?\!.*$//gsx;
-
-  # still problematic; kill all "x?" statements
-  $rule =~ s/.\?.*$//gsx;
-
-  # simplify (..)? and (..|) to (..|z{0})
-  # this wierd construct is to work around an re2c bug; (..|) doesn't
-  # do what it should
-  if ($keep_alternations) {
-    $rule =~ s/\((.*?)\)\?/\($1\|z{0}\)/gs;
-    $rule =~ s/\((.*?)\|\)/\($1\|z{0}\)/gs;
-    $rule =~ s/\(\|(.*?)\)/\($1\|z{0}\)/gs;
-  }
-
-  # re2xs doesn't like escaped brackets;
-  # brackets in general, in fact
-  $rule =~ s/\:.*//g;
-
-  # replace \s, \d, \S with char classes that (nearly) match them
-  # TODO: \w, \W need to know about utf-8, ugh
-
-  # [a-f\s]
-  $rule =~ s/(\[[^\]]*)\\s([^\]]*\])/$1 \\t\\n$2/gs;
-  # [a-f\S]: we can't support this, cut the string here
-  $rule =~ s/(\[[^\]]*)\\S([^\]]*\]).*//gs;
-  $rule =~ s/(\[[^\]]*)\\d([^\]]*\])/${1}0-9$2/gs;
-  $rule =~ s/(\[[^\]]*)\\D([^\]]*\]).*//gs;
-  $rule =~ s/(\[[^\]]*)\\w([^\]]*\])/${1}a-z0-9$2/gs;
-  $rule =~ s/(\[[^\]]*)\\W([^\]]*\]).*//gs;
-
-  # \s, etc. outside of existing char class blocks
-  $rule =~ s/\\S/[^ \\t\\n]/gs;
-  $rule =~ s/\\s/[ \\t\\n]/gs;
-  $rule =~ s/\\S/[^ \\t\\n]/gs;
-  $rule =~ s/\\d/[0-9]/gs;
-  $rule =~ s/\\D/[^0-9]/gs;
-  $rule =~ s/\\w/[_a-z0-9]/gs;
-  $rule =~ s/\\W/[^_a-z0-9]/gs;
-
-  # loop here, to catch __DRUGS_SLEEP1:
-  # 0,3}([ \t\n]|z{0})
-  while (1) 
-  {
-    my $startrule = $rule;
-
-    # exit early if the pattern starts with a class in a group;
-    # we can't reliably kill these
-    # r ([a-z0-9]+\*[,[ \t\n]]+){2}:TVD_BODY_END_STAR
-    if ($rule =~ /^\((?:
-              \.?[\*\?\+] |
-              \.?\{?[^\{]*\} |
-              [^\(]*\) |
-              \[ |
-              [^\[]*\]
-            )/sx)
-    {
-      return;
-    }
-
-    # kill quantifiers right at the start of the string.
-    # this (a) reduces algorithmic complexity of the produced code,
-    # and (b) can also improve overall speed as a side-effect of (a)
-    $rule =~ s/^(?:
-              \.?[\*\?\+] |
-              \.?\{?[^\{]*\} |
-              [^\(]*\) |
-              \[?[^\[]*\]
-            )+//gsx;
-
-    # kill quantifiers right at the end of the string, too;
-    # they can hide hits if they overlap with other patterns
-    0 and $rule =~ s/(?:
-              \.[\*\?\+] |
-              \.\{?[^\{]*\} |
-              \. |
-              \([^\)]* |
-              \[[^\[]*\]?
-            )+$//gsx;
-
-    last if $startrule eq $rule;
-  }
-
-
-  # return for things we know we can't handle.
-  if (!$keep_alternations) {
-    if ($rule =~ /\|/) {
-      # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
-      return;
-    }
-  }
-
-  {
-    # count (...braces...) to ensure the numbers match up
-    my @c = ($rule =~ /(?<!\\)\(/g); my $brace_i = scalar @c;
-       @c = ($rule =~ /(?<!\\)\)/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { return; }
-  }
-
-  # do the same for [charclasses]
-  {
-    my @c = ($rule =~ /(?<!\\)\[/g); my $brace_i = scalar @c;
-       @c = ($rule =~ /(?<!\\)\]/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { return; }
-  }
-
-  # and {quantifiers}
-  {
-    my @c = ($rule =~ /(?<!\\)\{/g); my $brace_i = scalar @c;
-       @c = ($rule =~ /(?<!\\)\}/g); my $brace_o = scalar @c;
-    if ($brace_i != $brace_o) { return; }
-  }
-
-  # lookaheads that are just too far for the re2c parser
-  # r your .{0,40}account .{0,40}security
-  if ($rule =~ /\.\{(\d+),?(\d+?)\}/ and ($1+$2 > 20)) {
-    return;
-  }
-
-  # re2xs doesn't like escaped brackets
-  if ($rule =~ /\\:/) {
-    return;
-  }
-
-  # finally, reassemble a usable regexp
-  if ($mods ne '') {
-    $mods = "(?$mods)";
-  }
-  $rule = $mods . $rule;
-
-  return $rule;
-}
-
-sub count_regexp_statements {
-  my $self = shift;
-  my $rule = shift;
-
-  # collapse various common metachar sequences into 1 char,
-  # or their shortest form
-  $rule =~ s/(?<!\\)(?:
-            \[.+?\][\?\*]|
-            \{0\}\?|
-            \{.+?\}\?
-          )//gs;
-
-  $rule =~ s/\[.+?\]/R/gs;
-  $rule =~ s/\{.+?\}/Q/gs;
-  $rule =~ s/.\?//gs;
-  $rule =~ s/.\*//gs;
-
-  return length $rule;
-}
-
-sub de_reverse_multi_char_regexp_statements {
-  my $rule = shift;
-
-  # fix:
-  #    "S\" => "\S"
-  #    "+S\" => "\S+"
-  #    "}41,2{S\" => "\S{2,14}"
-  #    "?}41,2{S\" => "\S{2,14}?"
-
-  $rule =~ s/
-        (
-          \? |
-        )
-        (
-          \}(?:\d*\,)?\d*\{ |
-          \* |
-          \+ |
-          \? |
-        )
-        (.)(\\?)/$4$3$2$1/gsx;
-
-  return $rule;
 }
 
 1;

Modified: spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs?view=diff&rev=451377&r1=451376&r2=451377
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs (original)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/re2xs Fri Sep 29 11:21:10 2006
@@ -101,6 +101,8 @@
     }
 
     my ($regexp, $reason) = /^r (.*):(.*)$/;
+    die "no 'r REGEXP:REASON' in $_" unless defined $regexp;
+
     if ($regexp =~ /^\.\*/) {
       push @dot_star, "$regexp:$reason";
       next;

Modified: spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre?view=diff&rev=451377&r1=451376&r2=451377
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre Fri Sep 29 11:21:10 2006
@@ -1,3 +1,4 @@
 
+loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
 loadplugin Mail::SpamAssassin::Plugin::Rule2XSBody