You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/02/02 19:31:21 UTC
svn commit: r502687 - in /spamassassin/trunk/masses/rule-dev: phrase-extract-in-log seek-phrases-in-corpus
Author: jm
Date: Fri Feb 2 10:31:20 2007
New Revision: 502687
URL: http://svn.apache.org/viewvc?view=rev&rev=502687
Log:
Reduce RAM usage in the rule-seeker script: there is no need to track ham hits at all. Instead, discard any candidate rule as soon as it hits a single ham message, and free the memory it was using.
Modified:
spamassassin/trunk/masses/rule-dev/phrase-extract-in-log
spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
Modified: spamassassin/trunk/masses/rule-dev/phrase-extract-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/phrase-extract-in-log?view=diff&rev=502687&r1=502686&r2=502687
==============================================================================
--- spamassassin/trunk/masses/rule-dev/phrase-extract-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/phrase-extract-in-log Fri Feb 2 10:31:20 2007
@@ -25,12 +25,14 @@
# ---------------------------------------------------------------------------
-use warnings;
-use strict;
+my $MAX_TEXT_IN_MESSAGE = 32678; # bytes of message text examined
+
+my $REQUIRE_PERCENT_SPAM_HITS = 0.5; # % hitrate reqd to list
-my $MAX_TEXT_IN_MESSAGE = 32678; # bytes of message examined
+# ---------------------------------------------------------------------------
-my $REQUIRE_PERCENT_SPAM_HITS = 1; # 1% hitrate reqd
+use warnings;
+use strict;
my $fh = shift @ARGV;
my $fs = shift @ARGV;
@@ -41,23 +43,21 @@
my $sym_acc = 'a'; # symbols are represented using IDs from this counter
my $msgcount = 0;
-my @t_spam = ();
-my @t_ham = ();
-my %spam = ();
-my %ham = ();
-my $stot = 0;
-my $htot = 0;
-my %set_hit = ();
+my @text_string = ();
+my %ngram_count = ();
+my %msg_subset_hit = ();
-open IN, "<$fh" or die "cannot open ham log $fh";
+warn "reading $fs...\n";
+open IN, "<$fs" or die "cannot open spam log $fs";
while (<IN>) {
- /^text: (.*)$/ and proc_text($1, \@t_ham, \%ham, \$htot);
+ /^text: (.*)$/ and proc_text(1, $1);
}
close IN;
-open IN, "<$fs" or die "cannot open spam log $fs";
+warn "reading $fh...\n";
+open IN, "<$fh" or die "cannot open ham log $fh";
while (<IN>) {
- /^text: (.*)$/ and proc_text($1, \@t_spam, \%spam, \$stot);
+ /^text: (.*)$/ and proc_text(0, $1);
}
close IN;
@@ -66,12 +66,16 @@
sub proc_text {
- my ($text, $tary, $target, $ttotref) = @_;
+ my ($adding, $text) = @_;
if (length($text) > $MAX_TEXT_IN_MESSAGE) {
$text = substr $text, 0, $MAX_TEXT_IN_MESSAGE; # chop!
}
- push @{$tary}, $text;
+
+ # we only need to save spam samples in memory, ignore ham samples
+ if ($adding) {
+ push @text_string, $text;
+ }
my $cp = pack "l", $msgcount;
$msgcount++;
@@ -99,54 +103,57 @@
}
foreach my $tok (keys %tokens) {
- $target->{$tok}++;
- $set_hit{$tok} .= $cp; # the message subset hit by this tok
+ if ($adding) {
+ # incr the counter for this token
+ $ngram_count{$tok}++;
+ $msg_subset_hit{$tok} .= $cp; # the message subset hit by this tok
+
+ } else {
+ # we're not tracking hits; we're killing false positives.
+ # as soon as a single FP appears, kill all record of that token,
+ # it cannot be used
+ delete $ngram_count{$tok};
+ delete $msg_subset_hit{$tok};
+ }
}
- $$ttotref++;
}
sub summarise {
- foreach my $id (keys %spam) {
- $set_hit{$id} = unpack("%32C*", $set_hit{$id}); # hash
+ warn "summarizing...\n";
+
+ # hash all msg_subset_hit lists; we don't need the full data, so this
+ # saves space
+ foreach my $id (keys %msg_subset_hit) {
+ $msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id});
}
- # note: we don't care about stuff that appears only in ham
- $htot ||= 0.000001;
- $stot ||= 0.000001;
+ # note: we don't care about stuff that hits *any* ham at all
+ my $msg_count_spam = scalar @text_string;
+ $msg_count_spam ||= 0.000001;
my %all_patterns_for_set = ();
- my %so = ();
-
- foreach my $id (keys %spam) {
- my $ham = ($ham{$id} || 0) / $htot;
- my $spam = ($spam{$id} || 0) / $stot;
- my $t = $ham + $spam || 0.000001;
- my $so = $spam / $t;
+ foreach my $id (keys %ngram_count) {
+ my $count = $ngram_count{$id};
my $bad;
- # only collapse sets for 1.0 S/O rules
- if ($so != 1.0) {
- $bad++;
- }
- # and must occur more than once!
- elsif ($spam{$id} <= 1) {
+
+ # must occur more than once!
+ if ($count <= 1) {
$bad++;
}
# require N% spam hits
- elsif (($spam{$id}*100) / $stot < $REQUIRE_PERCENT_SPAM_HITS) {
+ elsif (($count*100) / $msg_count_spam < $REQUIRE_PERCENT_SPAM_HITS) {
$bad++;
}
if ($bad) {
# we don't need to remember anything about this pattern after here
- delete $ham{$id};
- delete $spam{$id};
- delete $set_hit{$id};
+ delete $ngram_count{$id};
+ delete $msg_subset_hit{$id};
next;
}
- $so{$id} = $so; # since we only list 1.0 S/Os, this is irrelevant
- my $set = $set_hit{$id};
+ my $set = $msg_subset_hit{$id};
$all_patterns_for_set{$set} ||= [];
push @{$all_patterns_for_set{$set}}, decode_sym2words($id);
}
@@ -155,12 +162,10 @@
printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
foreach my $id (sort {
- # $so{$a} <=> $so{$b} ||
- $spam{$a} <=> $spam{$b}
- # || $ham{$b} <=> $ham{$a}
- } keys %so)
+ $ngram_count{$a} <=> $ngram_count{$b}
+ } keys %ngram_count)
{
- my $set = $set_hit{$id};
+ my $set = $msg_subset_hit{$id};
next if $done_set{$set}; $done_set{$set}++;
# we now have several patterns. see if we can expand them sideways
@@ -170,8 +175,7 @@
# my $pats = collapse_pats_basic($all_patterns_for_set{$set});
printf "%6.3f %6.3f %6.3f %s\n",
- $so{$id}, ($spam{$id}*100) / $stot, (($ham{$id}||0)*100) / $htot,
- $pats;
+ 1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0, $pats;
}
}
@@ -203,7 +207,7 @@
# warn "JMD $pat";
$pat =~ s/^\s+//;
- my @hits = grep /\Q$pat\E/, @t_spam;
+ my @hits = grep /\Q$pat\E/, @text_string;
if (scalar @hits == 0) {
warn "supposed pattern /$pat/ is 0-hitter";
push @ret, "[*]$pat";
@@ -235,7 +239,7 @@
# give up if there are a differing number of hits for the new pat
my $newpat = $found.$pat;
- if (scalar (grep /\Q$newpat\E/, @t_spam) != scalar @hits) { last; }
+ if (scalar (grep /\Q$newpat\E/, @text_string) != scalar @hits) { last; }
$pat = $newpat; # and carry on
}
@@ -249,7 +253,7 @@
}
my $newpat = $pat.$1;
- if (scalar (grep /\Q$newpat\E/, @t_spam) != scalar @hits) { last; }
+ if (scalar (grep /\Q$newpat\E/, @text_string) != scalar @hits) { last; }
$pat = $newpat; # and carry on
}
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus?view=diff&rev=502687&r1=502686&r2=502687
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-corpus Fri Feb 2 10:31:20 2007
@@ -71,11 +71,11 @@
'grep=s' => \$opt{grep},
) or die "see perldoc for usage";
-my $mcargs = join(' ', @ARGV);
+my $mcargs = ' '.join(' ', @ARGV).' ';
# extract just the ham or spam targets
-my $mcargs_h = $mcargs; $mcargs_h =~ s/\bspam:\S+\b//gs;
-my $mcargs_s = $mcargs; $mcargs_s =~ s/\bham:\S+\b//gs;
+my $mcargs_h = $mcargs; $mcargs_h =~ s/ spam:\S+ //gs;
+my $mcargs_s = $mcargs; $mcargs_s =~ s/ ham:\S+ //gs;
if ($mcargs_h !~ /\bham:/) {
die "seek-phrases-in-corpus: no 'ham:type:path' corpus specifier found!\n";