You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/10/06 13:42:26 UTC

svn commit: r453556 [1/3] - in /spamassassin/branches/jm_re2c_hacks: ./ lib/Mail/SpamAssassin/Plugin/ rule2xs/RabinKarpAccel-0.01/ rule2xs/RabinKarpAccel-0.01/lib/ rule2xs/RabinKarpAccel-0.01/t/

Author: jm
Date: Fri Oct  6 04:42:25 2006
New Revision: 453556

URL: http://svn.apache.org/viewvc?view=rev&rev=453556
Log:
check in Rabin-Karp code.  it works, but sadly winds up slower than normal body rules in 'real-world' mass-checks; the overhead outweighs the efficiency benefits of parallelized matching vs sequential regexp matches

Added:
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Changes
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/MANIFEST
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/META.yml
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/README
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/RabinKarpAccel.xs
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/RabinKarpAccel.pm
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/ppport.h
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/t/
    spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/t/RabinKarpAccel.t
Modified:
    spamassassin/branches/jm_re2c_hacks/MANIFEST
    spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Modified: spamassassin/branches/jm_re2c_hacks/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/MANIFEST?view=diff&rev=453556&r1=453555&r2=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/MANIFEST (original)
+++ spamassassin/branches/jm_re2c_hacks/MANIFEST Fri Oct  6 04:42:25 2006
@@ -497,3 +497,6 @@
 t/uribl.t
 t/shortcircuit.t
 t/spamc_y.t
+lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
+lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm
+lib/Mail/SpamAssassin/Plugin/Rule2XSBody.pm

Modified: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm?view=diff&rev=453556&r1=453555&r2=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm (original)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm Fri Oct  6 04:42:25 2006
@@ -62,31 +62,42 @@
 sub finish_parsing_end {
   my ($self, $params) = @_;
   my $conf = $params->{conf};
+  $self->extract_bases($conf);
+}
+
+sub extract_bases {
+  my ($self, $conf) = @_;
 
   # TODO: need a better way to do this rather than using an env
   # var as a back channel
   my $rawf = $ENV{'RULE_REGEXP_DUMP_FILE'};
-  return unless $rawf;
+  my $f;
 
-  $rawf =~ /^(.*)$/;
-  my $f = $1;       # untaint; allow anything here, it's from %ENV and safe
+  if ($rawf) {
+    $rawf =~ /^(.*)$/;
+    $f = $1;       # untaint; allow anything here, it's from %ENV and safe
+  }
 
-  $self->extract_bases_for_set ($f, $conf, $conf->{body_tests}, 'body');
+  $self->extract_set($f, $conf, $conf->{body_tests}, 'body');
 }
 
-sub extract_bases_for_set {
+sub extract_set {
   my ($self, $dumpfile, $conf, $test_set, $ruletype) = @_;
 
   foreach my $pri (keys %{$test_set}) {
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;
-    $self->extract_all($dumpfile, $conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
+    $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
+
+    if ($dumpfile) {
+      $self->dump_base_strings($dumpfile, $conf, $ruletype.'_'.$nicepri);
+    }
   }
 }
 
 ###########################################################################
 
-sub extract_all {
-  my ($self, $dumpfile, $conf, $rules, $ruletype) = @_;
+sub extract_set_pri {
+  my ($self, $conf, $rules, $ruletype) = @_;
 
   my @good_bases = ();
   my @failed = ();
@@ -106,6 +117,7 @@
     my $rule = $rules->{$name};
 
     # ignore ReplaceTags rules
+    # TODO: need cleaner way to do this
     next if ($conf->{rules_to_replace}->{$name});
 
     my $base  = $self->extract_base($rule, 0);
@@ -166,8 +178,8 @@
   # re2c, and it appears the re2c developers don't plan to offer this:
   # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
 
-  open (OUT, ">$dumpfile") or die "cannot write to $dumpfile!";
-  print OUT "name $ruletype\n";
+  $conf->{base_orig}->{$ruletype} = { };
+  $conf->{base_string}->{$ruletype} = { };
 
   foreach my $set1 (@good_bases) {
     my $base1 = $set1->{base};
@@ -175,7 +187,7 @@
     my $key1  = $set1->{name};
     next if ($base1 eq '' or $key1 eq '');
 
-    print OUT "orig $key1 $orig1\n";
+    $conf->{base_orig}->{$ruletype}->{$key1} = $orig1;
 
     foreach my $set2 (@good_bases) {
       next if ($set1 == $set2);
@@ -204,15 +216,34 @@
     my $base = $set->{base};
     my $key  = $set->{name};
     next unless $base;
-    print OUT "r $base:$key\n";
+    $conf->{base_string}->{$ruletype}->{$base} = $key;
   }
-  close OUT or die "close failed on $dumpfile!";
 
   warn ("zoom: base extraction complete for $ruletype: yes=$yes no=$no\n");
 }
 
 ###########################################################################
 
+sub dump_base_strings {
+  my ($self, $dumpfile, $conf, $ruletype) = @_;
+  # Writes the "name", "orig" and "r" lines for $ruletype, replacing $dumpfile.
+  # 3-arg open with a lexical handle: the path cannot smuggle in an open mode.
+  open (my $out, '>', $dumpfile) or die "cannot write to $dumpfile!";
+  print $out "name $ruletype\n";
+  foreach my $key1 (sort keys %{$conf->{base_orig}->{$ruletype}}) {
+    print $out "orig $key1 $conf->{base_orig}->{$ruletype}->{$key1}\n";
+  }
+
+  foreach my $key (sort keys %{$conf->{base_string}->{$ruletype}}) {
+    print $out "r $key:$conf->{base_string}->{$ruletype}->{$key}\n";
+  }
+  close $out or die "close failed on $dumpfile!";
+
+  warn ("zoom: bases written to '$dumpfile'\n");
+}
+
+###########################################################################
+
 # TODO:
 # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
 #     => should extract 'scription' somehow
@@ -256,6 +287,10 @@
 
     # remove (?i)
     $rule =~ s/\(\?i\)//gs;
+  }
+  else {
+    return if $rule =~ /\(\?i\)/;
+    return if $mods =~ /i/;
   }
 
   # remove /m and /s modifiers

Added: spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm (added)
+++ spamassassin/branches/jm_re2c_hacks/lib/Mail/SpamAssassin/Plugin/RabinKarpBody.pm Fri Oct  6 04:42:25 2006
@@ -0,0 +1,165 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+# Body-rule prefilter.  The base strings that BodyRuleBaseExtractor stores in
+# $conf->{base_string} are loaded into RabinKarpAccel lookup tables, one per
+# rule priority.  Each body line is then scanned once against the table, and
+# the full regexp of a rule is run only on lines where its base may occur.
+
+package Mail::SpamAssassin::Plugin::RabinKarpBody;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+use RabinKarpAccel;
+use Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
+
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# Constructor: defers to the parent class and re-blesses into $class.
+sub new {
+  my $class = shift;
+  my $mailsaobject = shift;
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($mailsaobject);
+  bless ($self, $class);
+
+  return $self;
+}
+
+###########################################################################
+
+# Called with a hash ref holding the configuration under 'conf'.  Runs the
+# base-string extraction, then builds the lookup tables for the body rules.
+# $conf->{skip_body_rules} is reset here and filled with every rule that a
+# table covers (its consumer is not part of this file).
+sub finish_parsing_end {
+  my ($self, $params) = @_;
+  my $conf = $params->{conf};
+
+  my $basextor = Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor->new(
+            $self->{main});
+  $basextor->extract_bases($conf);
+
+  $conf->{skip_body_rules} = { };
+  $self->setup_test_set ($conf, $conf->{body_tests}, 'body');
+}
+
+# Build one lookup table per priority found in $test_set.  The table name is
+# "<ruletype>_<priority>", with "-" in the priority spelled "neg".
+sub setup_test_set {
+  my ($self, $conf, $test_set, $ruletype) = @_;
+  foreach my $pri (keys %{$test_set}) {
+    my $nicepri = $pri; $nicepri =~ s/-/neg/g;
+    $self->setup_test_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
+  }
+}
+
+# Fill $conf->{$ruletype}->{rkhashes} from $conf->{base_string}->{$ruletype},
+# whose keys are base strings and whose values are space-separated rule
+# names.  Bases of 4 bytes or fewer are left out.  $rules is not used.
+sub setup_test_set_pri {
+  my ($self, $conf, $rules, $ruletype) = @_;
+
+  my $rkhashes = $conf->{$ruletype}->{rkhashes} = { };
+  my $bases = $conf->{base_string}->{$ruletype} || { };
+
+  foreach my $base (keys %{$bases}) {
+    next unless (length $base > 4);
+    my @rules = split(' ', $bases->{$base});
+
+    # matching is done on lower-cased text, so store the base lower-cased
+    RabinKarpAccel::add_bitvec($rkhashes, lc $base, [ @rules ]);
+    foreach my $rule (@rules) {
+      $conf->{skip_body_rules}->{$rule} = 1;
+    }
+  }
+}
+
+###########################################################################
+
+# Called with a hash ref holding 'ruletype', 'priority', 'permsgstatus' and
+# 'lines'.  Only ruletype 'body' is handled.  Each line is lower-cased and
+# scanned against the table for this priority; for every rule reported (once
+# per line, non-zero score only) the rule's <name>_one_line_body_test
+# function in Mail::SpamAssassin::PerMsgStatus is run on the original line.
+sub run_body_hack {
+  my ($self, $params) = @_;
+
+  return unless ($params->{ruletype} eq 'body');
+
+  my $nicepri = $params->{priority}; $nicepri =~ s/-/neg/g;
+  my $ruletype = ($params->{ruletype}.'_'.$nicepri);
+  my $scanner = $params->{permsgstatus};
+  my $conf = $scanner->{conf};
+
+  my $rkhashes = $conf->{$ruletype}->{rkhashes};
+  if (!$rkhashes || (scalar keys %{$rkhashes} <= 0))
+  {
+    dbg("zoom: run_body_hack for $ruletype skipped, no rules");
+    return;
+  }
+
+  # only needed by the disabled debug output further down
+  my $do_dbg = (would_log('dbg', 'zoom') > 1);
+  my $scoresptr = $conf->{scores};
+
+  dbg("zoom: run_body_hack for $ruletype start");
+
+  foreach my $line (@{$params->{lines}})
+  {
+    my $results = RabinKarpAccel::scan_string($rkhashes, lc $line);
+    next unless $results;
+
+    my %alreadydone = ();
+    foreach my $rulename (@{$results})
+    {
+      # only try each rule once per line
+      next if exists $alreadydone{$rulename};
+      $alreadydone{$rulename} = undef;
+
+      # ignore 0-scored rules, of course
+      next unless $scoresptr->{$rulename};
+
+      # TODO: it would be very useful to provide an optional
+      # means of instrumenting the ruleset, so that we can
+      # find out when the base matched but the full RE didn't.
+
+      # if ($do_dbg) {
+      # dbg("zoom: base found for $rulename: $line");
+      # }
+
+      # run the real regexp -- on this line alone.  The function is looked
+      # up by name, so symbolic references are allowed for this call only.
+      my $testfn = 'Mail::SpamAssassin::PerMsgStatus::'.$rulename.
+                    '_one_line_body_test';
+      {
+        no strict "refs";
+        &{$testfn}($scanner, $line);
+      }
+    }
+  }
+
+  dbg("zoom: run_body_hack for $ruletype done");
+}
+
+###########################################################################
+
+1;

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Changes
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Changes?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Changes (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Changes Fri Oct  6 04:42:25 2006
@@ -0,0 +1,6 @@
+Revision history for Perl extension RabinKarpAccel.
+
+0.01  Mon Oct  2 14:11:46 2006
+	- original version; created by h2xs 1.23 with options
+		-b 5.6.1 -A -f -n RabinKarpAccel
+

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/MANIFEST
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/MANIFEST?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/MANIFEST (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/MANIFEST Fri Oct  6 04:42:25 2006
@@ -0,0 +1,9 @@
+RabinKarpAccel.xs
+Changes
+Makefile.PL
+MANIFEST
+ppport.h
+README
+t/RabinKarpAccel.t
+lib/RabinKarpAccel.pm
+META.yml                                 Module meta-data (added by MakeMaker)

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/META.yml
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/META.yml?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/META.yml (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/META.yml Fri Oct  6 04:42:25 2006
@@ -0,0 +1,10 @@
+# http://module-build.sourceforge.net/META-spec.html
+#XXXXXXX This is a prototype!!!  It will change in the future!!! XXXXX#
+name:         RabinKarpAccel
+version:      0.01
+version_from: lib/RabinKarpAccel.pm
+installdirs:  site
+requires:
+
+distribution_type: module
+generated_by: ExtUtils::MakeMaker version 6.30_01

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/Makefile.PL Fri Oct  6 04:42:25 2006
@@ -0,0 +1,21 @@
+use 5.006001;
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+# Build script for the RabinKarpAccel XS extension.
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    NAME              => 'RabinKarpAccel',
+    VERSION_FROM      => 'lib/RabinKarpAccel.pm', # finds $VERSION
+    PREREQ_PM         => {}, # e.g., Module::Name => 1.1
+    # "use 5.006001" above already guarantees the perl >= 5.005 these need
+    ABSTRACT_FROM     => 'lib/RabinKarpAccel.pm', # retrieve abstract from module
+    AUTHOR            => 'A. U. Thor <jm@>',
+    LIBS              => [''], # e.g., '-lm'
+    DEFINE            => '', # e.g., '-DHAVE_SOMETHING'
+    INC               => '-I.', # e.g., '-I. -I/usr/include/other'
+    # Un-comment this if you add C files to link with later:
+    # OBJECT            => '$(O_FILES)', # link all the C files too
+);

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/README
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/README?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/README (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/README Fri Oct  6 04:42:25 2006
@@ -0,0 +1,40 @@
+RabinKarpAccel version 0.01
+========================
+
+The README is used to introduce the module and provide instructions on
+how to install the module, any machine dependencies it may have (for
+example C compilers and installed libraries) and any other information
+that should be provided before the module is installed.
+
+A README file is required for CPAN modules since CPAN extracts the
+README file from a module distribution so that people browsing the
+archive can use it to get an idea of the module's uses. It is usually a
+good idea to provide version information here so that people can
+decide whether fixes for the module are worth downloading.
+
+INSTALLATION
+
+To install this module type the following:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+
+DEPENDENCIES
+
+This module requires these other modules and libraries:
+
+  blah blah blah
+
+COPYRIGHT AND LICENCE
+
+Put the correct copyright and licence information here.
+
+Copyright (C) 2006 by A. U. Thor
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.8.7 or,
+at your option, any later version of Perl 5 you may have available.
+
+

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/RabinKarpAccel.xs
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/RabinKarpAccel.xs?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/RabinKarpAccel.xs (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/RabinKarpAccel.xs Fri Oct  6 04:42:25 2006
@@ -0,0 +1,293 @@
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#include "ppport.h"
+
+/* see http://www.eecs.harvard.edu/~ellard/Courses/sq98_root.pdf , pp 73-80
+ * for the Rabin-Karp algorithm definition
+ */
+#define fast_b  ((long) 257)     /* hash base */
+#define fast_m  ((long) 1024)    /* hash modulus; must stay a power of two */
+#define ksize   4                /* bytes per hashed window */
+
+/* Layout of the lookup hash (called "keys" or "bvhv" below):
+ *   "<window hash>" => ref to an array of rule names whose base string
+ *                      starts with ksize bytes hashing to that value
+ *   "*BITMAP"       => fast_m-byte string; byte N is non-zero once the key
+ *                      "N" has been added, so most misses skip hv_fetch
+ */
+
+/* Append every element of 'from' to 'to'.  The SVs end up shared by both
+ * arrays, so each one gains a reference: av_push() does not take one itself.
+ */
+static void av_push_all (AV *to, AV *from)
+{
+  int i, len;
+  SV **svptr;
+
+  len = av_len(from);
+  for (i = 0; i <= len; i++) {
+    svptr = av_fetch(from, i, 0);
+    if (svptr == NULL) {
+      continue;     /* this can happen, it seems */
+    }
+
+    SvREFCNT_inc(*svptr);
+    av_push (to, *svptr);
+  }
+}
+
+/* Copy the rule names stored under one hash bucket into 'results'.
+ * 'keysv' is the slot returned by hv_fetch(); anything that is not an array
+ * reference is ignored.  'keys' is unused but kept for the existing caller.
+ */
+static void add_rk_hit (AV *results, HV *keys, SV **keysv)
+{
+  (void) keys;
+
+  if (keysv == NULL || *keysv == NULL || !SvROK(*keysv) ||
+      SvTYPE(SvRV(*keysv)) != SVt_PVAV)
+  {
+    return;
+  }
+
+  /* add rule names to results AV */
+  av_push_all (results, (AV *) SvRV(*keysv));
+}
+
+/* Return a pointer to the fast_m-byte "*BITMAP" lookup table stored in
+ * 'keys', creating a zero-filled one on first use.  Dies if the stored
+ * string no longer has exactly fast_m bytes.
+ */
+static char *
+get_flut_str (HV *keys)
+{
+  SV **mapptr;
+  char buf[(int) fast_m];
+  SV *newmap;
+  char *flut_str;
+  STRLEN maplen;
+
+  mapptr = hv_fetch (keys, "*BITMAP", 7, 0);
+
+  /* create the map if it doesn't exist */
+  if (mapptr == NULL || *mapptr == NULL)
+  {
+    Zero((void *) buf, (int) fast_m, char);
+    newmap = newSVpvn(buf, (int) fast_m);       /* will take a copy */
+    hv_store (keys, "*BITMAP", 7, newmap, 0);
+    mapptr = &newmap;
+  }
+
+  flut_str = (char *) SvPV(*mapptr, maplen);
+  if (maplen != (STRLEN) fast_m) {
+    /* maplen is a STRLEN; cast so it matches the %d conversion */
+    die ("oops! maplen shrunk to %d", (int) maplen);
+  }
+
+  return flut_str;
+}
+
+/* Flag hash value 'P_hash' as present in the lookup table of 'keys'. */
+static void
+set_in_flut (HV *keys, int P_hash)
+{
+  char *flut_str;
+
+  if (P_hash >= (int) fast_m) {
+    die ("oops! P_hash %d > maplen %d", P_hash, (int) fast_m);
+  }
+  flut_str = get_flut_str(keys);
+  flut_str[P_hash] = (char) 1;
+}
+
+/* Compute (x ** n) % m by repeated squaring. */
+static unsigned long
+rk_exp_mod (unsigned long x, unsigned long n, unsigned long m)
+{
+  unsigned long square, exp;
+
+  if (n == 0) {
+    return 1;
+  }
+  else if (n == 1) {
+    return (x % m);
+  }
+  else {
+    square = (x * x) % m;
+    exp = rk_exp_mod (square, n / 2, m);
+    if (n % 2 == 0) {
+      return (exp % m);
+    } else {
+      return ((exp * x) % m);
+    }
+  }
+}
+
+/* Hash the first 'len' bytes of 'str' as a base-'b' number modulo 'm'.
+ * The caller must make sure 'str' really holds at least 'len' bytes.
+ */
+static long
+rk_hash (unsigned char *str, long len, long b, long m)
+{
+  long i;
+  long value = 0;
+  long power = 1;
+
+  for (i = len - 1; i >= 0; i--) {
+    value += (power * str [i]);
+    value %= m;
+    power *= b;
+    power %= m;
+  }
+  return (value);
+}
+
+/* Slide a ksize-byte window over the T_len bytes at 'T' and push onto
+ * 'results' the rule names of every window whose hash is present in 'keys'.
+ * A rule name is pushed once per matching window, so it may appear more
+ * than once; a hit only means the hashes agree, not that the text matches.
+ */
+static void
+rk_search (AV *results, HV *keys, unsigned char *T, long T_len)
+{
+  long top_one;
+  long T_hash;
+  long i;
+  SV *hashkey;
+  char *hashkeystr;
+  STRLEN len;
+  SV **keysv;
+  char *flut_str;
+
+  /* text shorter than one window: nothing can match, and hashing it
+   * would read past the end of the buffer */
+  if (T_len < ksize) {
+    return;
+  }
+
+  flut_str = get_flut_str(keys);
+  top_one = rk_exp_mod (fast_b, ksize, fast_m);
+  T_hash = rk_hash (T, ksize, fast_b, fast_m);
+
+  for (i = 0; i <= T_len - ksize; i++) {
+    /* do we have a hash hit? */
+    if (flut_str[(int) T_hash] != (char) 0) {
+      hashkey = sv_2mortal(newSVpvf("%d", (int) T_hash));
+      hashkeystr = SvPV(hashkey, len);
+      if ((keysv = hv_fetch (keys, hashkeystr, len, 0)) != NULL)
+      {
+        /* copy the rule name SV ptrs to the results AV */
+        add_rk_hit(results, keys, keysv);
+      }
+    }
+
+    /* that was the last window; rolling on would read T[T_len] */
+    if (i + ksize >= T_len) {
+      break;
+    }
+
+    /* the bit-shifting Karp-Rabin sliding hash -- bit-shifts are fast */
+    T_hash *= fast_b;
+    T_hash -= ((T[i] * top_one) & (fast_m - 1));
+    T_hash += T[i + ksize];
+    T_hash &= (fast_m - 1);
+    if (T_hash < 0) { T_hash += fast_m; }
+  }
+}
+
+/* Perl-visible functions (see the XSUBs below):
+ *   add_bitvec(\%table, $str, \@rules)  registers @rules under the hash of
+ *                                       the first ksize bytes of $str
+ *   scan_string(\%table, $text)         returns a ref to an array of the
+ *                                       rule names hit anywhere in $text
+ */
+
+MODULE = RabinKarpAccel		PACKAGE = RabinKarpAccel
+
+PROTOTYPES: DISABLE
+
+void
+add_bitvec(bvhash, str, rulesary)
+        SV* bvhash
+        SV* str
+        SV* rulesary
+
+  PREINIT:
+        unsigned char *pstart;
+        STRLEN plen;
+        HV *bvhv;
+        SV *hashkey;
+        char *hashkeystr;
+        STRLEN len;
+        long P_hash;
+        SV **svptr;
+
+  CODE:
+        if (!SvROK(bvhash) || (SvTYPE(SvRV(bvhash)) != SVt_PVHV)) {
+          die("bad type for bvhash");
+        }
+        bvhv = (HV *) SvRV(bvhash);
+
+        if (!SvROK(rulesary) || (SvTYPE(SvRV(rulesary)) != SVt_PVAV)) {
+          die("bad type for rulesary");
+        }
+
+        pstart = (unsigned char *) SvPVutf8(str, plen);
+
+        /* rk_hash() reads ksize bytes, so refuse anything shorter */
+        if (plen < (STRLEN) ksize) {
+          die("str is shorter than %d bytes", (int) ksize);
+        }
+
+        P_hash = rk_hash (pstart, ksize, fast_b, fast_m);
+
+        /* add the contents of @{$rulesary} to the bvhv hash under
+         * the key "P_hash"; the key SV is mortal so it is not leaked */
+        hashkey = sv_2mortal(newSVpvf("%d", (int) P_hash));
+        hashkeystr = SvPV(hashkey, len);
+        svptr = hv_fetch (bvhv, hashkeystr, len, 0);
+
+        if (svptr == NULL || *svptr == NULL ||
+          !SvROK(*svptr) ||
+          (SvTYPE(SvRV(*svptr)) != SVt_PVAV))
+        {
+          SvREFCNT_inc(rulesary);
+          hv_store (bvhv, hashkeystr, len, rulesary, 0);
+        } else {
+          av_push_all ((AV *) SvRV(*svptr), (AV *) SvRV(rulesary));
+        }
+
+        /* ensure we set the flag char in the fast lookup table, too */
+        set_in_flut(bvhv, (int) P_hash);
+
+SV *
+scan_string(bvhash, psv)
+        SV* bvhash
+        SV* psv
+
+  PREINIT:
+        unsigned char *pstart;
+        STRLEN plen;
+        AV *results;
+        HV *bvhv;
+
+  CODE:
+        if (!SvROK(bvhash) || (SvTYPE(SvRV(bvhash)) != SVt_PVHV)) {
+          die("bad type for bvhash");
+        }
+        bvhv = (HV *) SvRV(bvhash);
+
+        pstart = (unsigned char *) SvPVutf8(psv, plen);
+        /* mortal, so the array is freed even if rk_search() dies; the
+         * reference made below keeps it alive for the caller */
+        results = (AV *) sv_2mortal((SV *) newAV());
+
+        rk_search (results, bvhv, pstart, plen);
+
+        RETVAL = newRV((SV *) results);
+    OUTPUT:
+        RETVAL
+
+

Added: spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/RabinKarpAccel.pm
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/RabinKarpAccel.pm?view=auto&rev=453556
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/RabinKarpAccel.pm (added)
+++ spamassassin/branches/jm_re2c_hacks/rule2xs/RabinKarpAccel-0.01/lib/RabinKarpAccel.pm Fri Oct  6 04:42:25 2006
@@ -0,0 +1,86 @@
+package RabinKarpAccel;
+
+use 5.006001;
+use strict;
+use warnings;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+
+# Nothing is exported.  The only in-tree caller visible alongside this file
+# (Mail::SpamAssassin::Plugin::RabinKarpBody) uses fully qualified names such
+# as RabinKarpAccel::scan_string(), so the export lists below stay empty.
+
+# The ':all' tag comes from the h2xs template and is empty as well, so
+# "use RabinKarpAccel ':all';" imports nothing.
+# NOTE(review): confirm nobody relies on Exporter before removing these.
+our %EXPORT_TAGS = ( 'all' => [ qw(
+	
+) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw(
+	
+);
+
+our $VERSION = '0.01';
+
+require XSLoader;
+XSLoader::load('RabinKarpAccel', $VERSION);
+
+# add_bitvec() and scan_string() are implemented in RabinKarpAccel.xs.
+
+1;
+__END__
+# Below is stub documentation for your module. You'd better edit it!
+
+=head1 NAME
+
+RabinKarpAccel - XS helper for Rabin-Karp multi-string prefiltering
+
+=head1 SYNOPSIS
+
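+  use RabinKarpAccel;
+  RabinKarpAccel::add_bitvec(\%table, lc $base, \@rule_names);
+  my $hits = RabinKarpAccel::scan_string(\%table, lc $text);
+
+=head1 DESCRIPTION
+
+add_bitvec() files the rule names under a hash of the first 4 bytes of the
+base string; scan_string() hashes every 4-byte window of the text and
+returns a reference to an array of the rule names filed under matching
+hashes.  A hit is only a hash match, so confirm it with the real pattern.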
+
+=head2 EXPORT
+
+None by default.
+
+
+
+=head1 SEE ALSO
+
+Mention other useful documentation such as the documentation of
+related modules or operating system documentation (such as man pages
+in UNIX), or any relevant external documentation such as RFCs or
+standards.
+
+If you have a mailing list set up for your module, mention it here.
+
+If you have a web site set up for your module, mention it here.
+
+=head1 AUTHOR
+
+A. U. Thor, E<lt>jm@E<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2006 by A. U. Thor
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.8.7 or,
+at your option, any later version of Perl 5 you may have available.
+
+
+=cut