You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/01/22 14:04:41 UTC
svn commit: r498601 - in /spamassassin/trunk/masses/rule-dev: phrase-extract-in-log seek-phrases-in-corpus

Author: jm
Date: Mon Jan 22 05:04:40 2007
New Revision: 498601

URL: http://svn.apache.org/viewvc?view=rev&rev=498601
Log:
add a hack-in-progress to SVN; seek-phrases-in-corpus. Given a small corpus of spam, and a corpus of ham, seek out common phrases that appear only in the spam and would make good phrase rules, using a (simple but relatively memory-efficient) BLAST-style algorithm.

Added:
    spamassassin/trunk/masses/rule-dev/phrase-extract-in-log   (with props)
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus   (with props)

Added: spamassassin/trunk/masses/rule-dev/phrase-extract-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/phrase-extract-in-log?view=auto&rev=498601
==============================================================================
--- spamassassin/trunk/masses/rule-dev/phrase-extract-in-log (added)
+++ spamassassin/trunk/masses/rule-dev/phrase-extract-in-log Mon Jan 22 05:04:40 2007
@@ -0,0 +1,277 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+phrase-extract-in-log - extract good-looking rule phrases from a text-dump mc log
+
+=cut
+
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+# ---------------------------------------------------------------------------
+
+use warnings;
+use strict;
+
+my $MAX_TEXT_IN_MESSAGE = 32678;        # bytes of message examined
+
+my $REQUIRE_PERCENT_SPAM_HITS = 1;      # 1% hitrate reqd
+
+my $fh = shift @ARGV;
+my $fs = shift @ARGV;
+die "usage: phrase-extract-in-log hamlog spamlog" unless ($fs && $fh);
+
+my %word2sym = ('' => '');
+my %sym2word = ('' => '');
+my $sym_acc = 'a';      # symbols are represented using IDs from this counter
+my $msgcount = 0;
+
+my @t_spam = ();
+my @t_ham = ();
+my %spam = ();
+my %ham = ();
+my $stot = 0;
+my $htot = 0;
+my %set_hit = ();
+
+open IN, "<$fh" or die "cannot open ham log $fh";
+while (<IN>) {
+  /^text: (.*)$/ and proc_text($1, \@t_ham, \%ham, \$htot);
+}
+close IN;
+
+open IN, "<$fs" or die "cannot open spam log $fs";
+while (<IN>) {
+  /^text: (.*)$/ and proc_text($1, \@t_spam, \%spam, \$stot);
+}
+close IN;
+
+summarise();
+exit;
+
+
+sub proc_text {
+  my ($text, $tary, $target, $ttotref) = @_;
+
+  if (length($text) > $MAX_TEXT_IN_MESSAGE) {
+    $text = substr $text, 0, $MAX_TEXT_IN_MESSAGE;      # chop!
+  }
+  push @{$tary}, $text;
+
+  my $cp = pack "l", $msgcount;
+  $msgcount++;
+
+  my $w1 = '';
+  my $w2 = '';
+  my $w3 = '';
+
+  my %tokens = ();
+  foreach my $w (split(' ', $text)) {
+    # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }
+
+    $w3 = $w2;
+    $w2 = $w1;
+
+    $w1 = $word2sym{$w};
+    if (!$w1) {
+      $word2sym{$w} = $w1 = $sym_acc;
+      $sym2word{$sym_acc} = $w;
+      $sym_acc++;
+    }
+
+    # simple bayesian N-grams to start
+    $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
+  }
+
+  foreach my $tok (keys %tokens) {
+    $target->{$tok}++;
+    $set_hit{$tok} .= $cp;          # the message subset hit by this tok
+  }
+  $$ttotref++;
+}
+
+sub summarise {
+  foreach my $id (keys %spam) {
+    $set_hit{$id} = unpack("%32C*", $set_hit{$id}); # hash
+  }
+  # note: we don't care about stuff that appears only in ham
+
+  $htot ||= 0.000001;
+  $stot ||= 0.000001;
+
+  my %all_patterns_for_set = ();
+  my %so = ();
+
+  foreach my $id (keys %spam) {
+    my $ham = ($ham{$id} || 0) / $htot;
+    my $spam = ($spam{$id} || 0) / $stot;
+    my $t = $ham + $spam || 0.000001;
+    my $so = $spam / $t;
+
+    my $bad;
+    # only collapse sets for 1.0 S/O rules
+    if ($so != 1.0) {
+      $bad++;
+    }
+    # and must occur more than once!
+    elsif ($spam{$id} <= 1) {
+      $bad++;
+    }
+    # require N% spam hits
+    elsif (($spam{$id}*100) / $stot < $REQUIRE_PERCENT_SPAM_HITS) {
+      $bad++;
+    }
+
+    if ($bad) {
+      # we don't need to remember anything about this pattern after here
+      delete $ham{$id};
+      delete $spam{$id};
+      delete $set_hit{$id};
+      next;
+    }
+
+    $so{$id} = $so;       # since we only list 1.0 S/Os, this is irrelevant
+    my $set = $set_hit{$id};
+    $all_patterns_for_set{$set} ||= [];
+    push @{$all_patterns_for_set{$set}}, decode_sym2words($id);
+  }
+
+  my %done_set = ();
+
+  printf ("%6s  %6s  %6s   %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
+  foreach my $id (sort {
+                      # $so{$a} <=> $so{$b} ||
+                      $spam{$a} <=> $spam{$b}
+                      # || $ham{$b} <=> $ham{$a}
+                  } keys %so)
+  {
+    my $set = $set_hit{$id};
+    next if $done_set{$set}; $done_set{$set}++;
+
+    # we now have several patterns.  see if we can expand them sideways
+    # to make the pattern bigger, and collapse into a smaller number of
+    # pats at the same time
+    my $pats = collapse_pats($all_patterns_for_set{$set});
+    # my $pats = collapse_pats_basic($all_patterns_for_set{$set});
+
+    printf "%6.3f  %6.3f  %6.3f  %s\n",
+        $so{$id}, ($spam{$id}*100) / $stot, (($ham{$id}||0)*100) / $htot,
+        $pats;
+  }
+}
+
+sub decode_sym2words {
+  my $ids = shift;
+  my $r;
+  if ($ids =~ /^(.*)\.(.*)\.(.*)$/) {
+    $r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}";
+  }
+  elsif ($ids =~ /^(.*)\.(.*)$/) {
+    $r = "$sym2word{$1} $sym2word{$2}";
+  }
+  $r =~ s/^\s+//;
+  return $r;
+}
+
+sub collapse_pats_basic {
+  return '/'. join ('/, /', map { s/\//[SLASH]/gs; $_; } @{$_[0]}). '/';
+}
+
+sub collapse_pats {
+  my $pataryref = $_[0];
+  my @ret = ();
+
+  while (1) {
+    my $pat = shift(@{$pataryref});
+    last unless defined($pat);
+
+    # warn "JMD $pat";
+    $pat =~ s/^\s+//;
+
+    my @hits = grep /\Q$pat\E/, @t_spam;
+    if (scalar @hits == 0) {
+      warn "supposed pattern /$pat/ is 0-hitter";
+      push @ret, "[*]$pat";
+      next;
+    }
+
+    # we don't have all day!
+    my $pat_maxlen = 32768;
+    my $s = $hits[0];
+
+    # Now, expand the pattern using a BLAST-style algorithm
+
+    # expand towards start of string
+    while (1) {
+      my $l = length($pat);
+      last if ($l > $pat_maxlen);     # too long
+
+      my $found;
+      # this is too slow
+      if ($s =~ /(.)\Q$pat\E/s) { $found = $1; }
+
+      # this is faster, since it doesn't start with a (.)
+      # if ($s =~ /\Q$pat\E/s) { $found = substr($s, pos($s) - $l, 1); }
+
+      if (!defined $found) {
+        # start of string.  break
+        last;
+      }
+
+      # give up if there are a differing number of hits for the new pat
+      my $newpat = $found.$pat;
+      if (scalar (grep /\Q$newpat\E/, @t_spam) != scalar @hits) { last; }
+
+      $pat = $newpat;     # and carry on
+    }
+    # warn "JMD $pat";
+
+    # expand towards end of string
+    while (1) {
+      if (length($pat) > $pat_maxlen || $s !~ /\Q$pat\E(.)/s) {
+        # end of string.  break
+        last;
+      }
+
+      my $newpat = $pat.$1;
+      if (scalar (grep /\Q$newpat\E/, @t_spam) != scalar @hits) { last; }
+
+      $pat = $newpat;     # and carry on
+    }
+    # warn "JMD $pat";
+
+    # now remove subsumed patterns
+    @{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref};
+
+    # warn "JMD $pat";
+    # skip recording this if it's already inside one of the results
+    next if grep { $_ =~ /\Q$pat\E/s } @ret;
+
+    # also, remove cases where this pattern contains previous results
+    @ret = grep { $pat !~ /\Q$_\E/s } @ret;
+
+    # warn "JMD $pat";
+    push (@ret, $pat);
+  }
+
+  # TODO: http://en.wikipedia.org/wiki/Needleman-Wunsch_algorithm
+
+  return '/'.join ('/, /', map { s/\//\\\//gs; $_; } @ret).'/';
+}
+
+

Propchange: spamassassin/trunk/masses/rule-dev/phrase-extract-in-log
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus?view=auto&rev=498601
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus (added)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus Mon Jan 22 05:04:40 2007
@@ -0,0 +1,124 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+seek-phrases-in-corpus - given a corpus of spam, seek out common phrases
+
+=head1 SYNOPSIS
+
+seek-phrases-in-corpus [--grep 'pattern'] ham:dir:/path spam:dir:/path2 ...
+
+=head1 DESCRIPTION
+
+Given a _small_ corpus of ham and spam mails (specified in mass-check format),
+this will attempt to find patterns that appear in at least 2 spams, then list
+out all the patterns that have a 1.0 S/O ratio (ie. hit spam and no ham).
+
+The output format looks like:
+
+ 1.000   8.633   0.000  /pattern/, /pattern2/, /pattern3/
+ 1.000   8.633   0.000  /pattern4/
+ 1.000  10.000   0.000  /pattern5/
+
+First field is S/O (and will always be 1.000).  Second, the SPAM%
+figure -- how much of the spam corpus, as a percentage, contains the
+pattern.  Third is the list of one or more pattern(s) that hit this
+subset of messages.
+
+Note that patterns that hit a different subset of the messages in the spam
+corpus, are listed on separate lines; e.g., in the example above, /pattern3/
+and /pattern4/ both hit 8.633% of the spam corpus -- however, they hit a
+different 8.633%, not the same subset of messages.  On the other hand,
+/pattern2/ and /pattern3/ are hitting exactly the same messages.
+
+The patterns are simple substrings, not regular expressions; don't
+be misled by the use of "/" as a delimiter.  The body text is rendered
+as SpamAssassin "body" rendering.
+
+=cut
+
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+# ---------------------------------------------------------------------------
+
+use Getopt::Long;
+use Carp qw(croak);
+use FindBin;
+
+use strict;
+use warnings;
+
+my $sadir = "$FindBin::Bin/../..";
+my $tmpdir = "/tmp/findpats.tmp.$$";
+
+my %opt = ();
+GetOptions(
+  'grep=s'       => \$opt{grep},
+) or die "see perldoc for usage";
+
+my $mcargs = join(' ', @ARGV);
+
+# extract just the ham or spam targets
+my $mcargs_h = $mcargs; $mcargs_h =~ s/\bspam:\S+\b//gs;
+my $mcargs_s = $mcargs; $mcargs_s =~ s/\bham:\S+\b//gs;
+
+if ($mcargs_h !~ /\bham:/) {
+  die "seek-phrases-in-corpus: no 'ham:type:path' corpus specifier found!\n";
+}
+if ($mcargs_s !~ /\bspam:/) {
+  die "seek-phrases-in-corpus: no 'spam:type:path' corpus specifier found!\n";
+}
+
+my $re = $opt{grep};
+
+# ---------------------------------------------------------------------------
+
+(-d "$tmpdir/cor") and run ("rm -rf $tmpdir/cor");
+(-d "$tmpdir/cor") or run ("mkdir -p $tmpdir/cor");
+
+# note: -c=/dev/null so no rules ever run
+# don't grep the ham set!
+run("cd $sadir/masses && ".
+    "./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
+    " --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
+    " -n -o --showdots -c=/dev/null ".
+    " $mcargs_h > $tmpdir/w.h");
+
+# *do* grep the spam, though
+run("cd $sadir/masses && ".
+    "./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
+    " --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
+    ($re ? " --cf='grep $re' " : "").
+    " -n -o --showdots -c=/dev/null ".
+    " $mcargs_s > $tmpdir/w.s");
+
+run("perl -w $sadir/masses/rule-dev/phrase-extract-in-log ".
+        "$tmpdir/w.h $tmpdir/w.s > $tmpdir/result");
+
+run("cat $tmpdir/result");
+exit;
+
+# ---------------------------------------------------------------------------
+
+sub run {
+  my $cmd = shift;
+  warn "[$cmd]\n";
+  system $cmd;
+  ($? >> 8 != 0) and Carp::croak("command failed");
+}
+

Propchange: spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
------------------------------------------------------------------------------
    svn:executable = *