You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/11 20:15:36 UTC
svn commit: r555355 - in /spamassassin/trunk/masses/rule-dev:
seek-phrases-in-corpus seek-phrases-in-log
Author: jm
Date: Wed Jul 11 11:15:34 2007
New Revision: 555355
URL: http://svn.apache.org/viewvc?view=rev&rev=555355
Log:
new functionality for seek-phrases-in-log: rule generation. also some more QA/filtering on its output
Modified:
spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus?view=diff&rev=555355&r1=555354&r2=555355
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus Wed Jul 11 11:15:34 2007
@@ -110,7 +110,7 @@
" $mcargs_s > $tmpdir/w.s");
run("perl -w $sadir/masses/rule-dev/seek-phrases-in-log ".
- "$tmpdir/w.h $tmpdir/w.s > $tmpdir/result");
+ "--ham $tmpdir/w.h --spam $tmpdir/w.s > $tmpdir/result");
run("cat $tmpdir/result");
exit;
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=555355&r1=555354&r2=555355
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Wed Jul 11 11:15:34 2007
@@ -25,18 +25,43 @@
# ---------------------------------------------------------------------------
-my $MAX_TEXT_IN_MESSAGE = 32678; # bytes of message text examined
-
-my $REQUIRE_PERCENT_SPAM_HITS = 0.5; # % hitrate reqd to list
+sub usage {
+ die "
+usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n]
+ [--rules] [--maxtextread n] --ham hamlog --spam spamlog
+
+--reqhitrate: percentage hit-rate against spam required (default: 0.5)
+--reqpatlength: required pattern length, in characters (default: 0)
+--maxtextread: bytes of message text examined (default: 32768)
+--rules: generate SpamAssassin rule output (default: 0)
+";
+}
# ---------------------------------------------------------------------------
use warnings;
use strict;
+use Getopt::Long qw(:config no_ignore_case);
-my $fh = shift @ARGV;
-my $fs = shift @ARGV;
-die "usage: phrase-extract-in-log hamlog spamlog" unless ($fs && $fh);
+my %opt = ();
+$opt{reqhitrate} = 0.5;
+$opt{reqpatlength} = 0;
+$opt{maxtextread} = 32768;
+$opt{rules} = 0;
+
+my $fs;
+my $fh;
+my @files = ();
+GetOptions(
+ "rules" => \$opt{rules},
+ "reqhitrate=s" => \$opt{reqhitrate},
+ "reqpatlength=s" => \$opt{reqpatlength},
+ "ham=s" => \$fh,
+ "spam=s" => \$fs,
+ 'help' => \&usage
+) or usage();
+
+usage() unless ($fs && $fh);
my %word2sym = ('' => '');
my %sym2word = ('' => '');
@@ -73,8 +98,8 @@
my ($text) = @_;
# we only need to chop off the end of spam samples
- if (length($text) > $MAX_TEXT_IN_MESSAGE) {
- $text = substr $text, 0, $MAX_TEXT_IN_MESSAGE; # chop!
+ if (length($text) > $opt{maxtextread}) {
+ $text = substr $text, 0, $opt{maxtextread}; # chop!
}
# we only need to save spam samples in memory, ignore ham samples
@@ -177,7 +202,7 @@
$bad++;
}
# require N% spam hits
- elsif (($count*100) / $msg_count_spam < $REQUIRE_PERCENT_SPAM_HITS) {
+ elsif (($count*100) / $msg_count_spam < $opt{reqhitrate}) {
$bad++;
}
@@ -196,11 +221,12 @@
warn "message subsets found: ".(scalar keys %all_patterns_for_set)."\n";
my %done_set = ();
+ my @done_pats = ();
printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
$| = 1;
foreach my $id (sort {
- $ngram_count{$a} <=> $ngram_count{$b}
+ $ngram_count{$b} <=> $ngram_count{$a}
} keys %ngram_count)
{
my $set = $msg_subset_hit{$id};
@@ -209,11 +235,41 @@
# we now have several patterns. see if we can expand them sideways
# to make the pattern bigger, and collapse into a smaller number of
# pats at the same time
- my $pats = collapse_pats($all_patterns_for_set{$set});
- # my $pats = collapse_pats_basic($all_patterns_for_set{$set});
+ my @pats = collapse_pats($all_patterns_for_set{$set});
+ # my @pats = collapse_pats_basic($all_patterns_for_set{$set});
+
+ # now check to see if any of these pats have been subsumed in an
+ # already-output pattern (one with more hits!)
+ my @pats_new = ();
+ foreach my $pat (@pats) {
+ my $subsumed = 0;
+ foreach my $done (@done_pats) {
+ if ($pat =~ /\Q${done}\E/) { $subsumed=1; last; }
+ }
+ if (!$subsumed) { push @pats_new, $pat; }
+ }
+ @pats = @pats_new;
+
+ # if we have no non-subsumed pats at this stage, skip this set
+ next unless @pats;
+
+ push @done_pats, @pats;
- printf "%6.3f %6.3f %6.3f %s\n",
+ if ($opt{rules}) {
+ printf "# %6.3f %6.3f %6.3f\n",
+ 1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0;
+
+ foreach my $pat (@pats) {
+ $pat =~ s/([\\\/\.\(\)\[\]\+\*\@\%\$])/\\$1/gs;
+ my $name = generate_rule_name($pat);
+ print "body SEEK_${name} /$pat/\n";
+ }
+
+ } else {
+ my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/';
+ printf "%6.3f %6.3f %6.3f %s\n",
1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0, $pats;
+ }
}
}
@@ -234,7 +290,7 @@
}
sub collapse_pats_basic {
- return '/'. join ('/, /', map { s/\//[SLASH]/gs; $_; } @{$_[0]}). '/';
+ return @{$_[0]};
}
sub collapse_pats {
@@ -256,7 +312,7 @@
}
# we don't have all day!
- my $pat_maxlen = $MAX_TEXT_IN_MESSAGE;
+ my $pat_maxlen = $opt{maxtextread};
my $s = $hits[0];
# Now, expand the pattern using a BLAST-style algorithm
@@ -312,7 +368,22 @@
# TODO: http://en.wikipedia.org/wiki/Needleman-Wunsch_algorithm
- return '/'.join ('/, /', map { s/\//\\\//gs; $_; } @ret).'/';
+ if ($opt{reqpatlength}) {
+ @ret = grep { length($_) >= $opt{reqpatlength} } @ret;
+ return unless @ret;
+ }
+
+ return @ret;
+}
+
+sub generate_rule_name {
+ my $str = shift;
+ use Digest::SHA1 qw(sha1_base64);
+ $str = sha1_base64($str);
+ $str =~ s/^(.{6}).*$/$1/gs;
+ $str =~ tr/a-z/A-Z/;
+ $str =~ s/[^A-Z0-9]/_/gs;
+ return $str;
}