You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/20 17:16:47 UTC
svn commit: r558018 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Author: jm
Date: Fri Jul 20 08:16:46 2007
New Revision: 558018
URL: http://svn.apache.org/viewvc?view=rev&rev=558018
Log:
improve memory performance of rule-seeker by discarding 1-hit (hapax) entries every 1000 messages scanned; ensure more chars are quoted correctly in the output; and don't allocate any memory when processing ham for better speed
Modified:
spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=558018&r1=558017&r2=558018
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Fri Jul 20 08:16:46 2007
@@ -108,6 +108,8 @@
my $cp = pack "l", $msgcount;
$msgcount++;
+ (($msgcount % 1000) == 999) and discard_hapaxes();
+
my $w1 = '';
my $w2 = '';
my $w3 = '';
@@ -142,6 +144,19 @@
}
}
+sub discard_hapaxes {    # prune every n-gram whose count is exactly 1; invoked when ($msgcount % 1000) == 999
+ my $killed = 0;    # number of entries removed during this pass
+ foreach my $tok (keys %ngram_count) {    # keys() builds the whole key list first, so deleting inside the loop is safe
+ if ($ngram_count{$tok} == 1) {
+ delete $ngram_count{$tok};
+ delete $msg_subset_hit{$tok};    # remove the same key from the companion hash as well
+ $killed++;
+ }
+ }
+ warn "shrunk dbs: $killed hapaxes killed, kept ".    # report removed and remaining entry counts via warn
+ (scalar keys %ngram_count)." entries\n";
+}
+
sub proc_text_ham {
my ($text) = @_;
@@ -159,7 +174,7 @@
# since we're deleting, there's no need to add new words
# to the dictionary; just use the final $sym_acc to mean
# "unknown ham word", and don't increment it
- $word2sym{$w} = $w1 = $sym_acc;
+ $w1 = $sym_acc;
}
$tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
@@ -236,7 +251,7 @@
# to make the pattern bigger, and collapse into a smaller number of
# pats at the same time
my @pats = collapse_pats($all_patterns_for_set{$set});
- # my @pats = collapse_pats_basic($all_patterns_for_set{$set});
+ # my @pats = @{$all_patterns_for_set{$set}};
# now check to see if any of these pats have been subsumed in an
# already-output pattern (one with more hits!)
@@ -260,7 +275,7 @@
1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0;
foreach my $pat (@pats) {
- $pat =~ s/([\\\/\.\(\)\[\]\+\*\@\%\$])/\\$1/gs;
+ $pat =~ s/([!-+\`\^\~\\\/\|\.\(\)\[\]\@])/\\$1/gs;
my $name = generate_rule_name($pat);
print "body SEEK_${name} /$pat/\n";
}
@@ -287,10 +302,6 @@
}
$r =~ s/^\s+//;
return $r;
-}
-
-sub collapse_pats_basic {
- return @{$_[0]};
}
sub collapse_pats {