You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2008/12/29 11:50:17 UTC
svn commit: r729859 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Author: jm
Date: Mon Dec 29 02:50:16 2008
New Revision: 729859
URL: http://svn.apache.org/viewvc?rev=729859&view=rev
Log:
bug: some FPing tokens would not be removed if they occurred in FPs at the start of a 'paragraph'. fix
Modified:
spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?rev=729859&r1=729858&r2=729859&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Mon Dec 29 02:50:16 2008
@@ -185,6 +185,7 @@
}
foreach my $tok (keys %tokens) {
+ #warn "JMD adding $tok => ".decode_sym2words($tok);
# incr the counter for this token
$ngram_count{$tok}++;
$msg_subset_hit{$tok} .= $cp; # the message subset hit by this tok
@@ -207,39 +208,52 @@
sub proc_text_ham {
my ($text) = @_;
- my $w1 = '';
- my $w2 = '';
- my $w3 = '';
-
my %tokens = ();
- foreach my $w (split(' ', $text)) {
- $w3 = $w2;
- $w2 = $w1;
-
- $w1 = $word2sym{$w};
- if (!$w1) {
- # since we're deleting, there's no need to add new words
- # to the dictionary; just use the final $sym_acc to mean
- # "unknown ham word", and don't increment it
- $w1 = $sym_acc;
- }
-
- $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
- }
-
- if ($w2 && $w1) {
- $tokens{"$w2.$w1"} = 1;
+ foreach my $line (split(/\[p\]/, $text)) {
+ my $w1 = '';
+ my $w2 = '';
+ my $w3 = '';
+
+ foreach my $w (split(' ', $line)) {
+ $w3 = $w2;
+ $w2 = $w1;
+
+ $w1 = $word2sym{$w};
+ if (!$w1) {
+ # since we're deleting, there's no need to add new words
+ # to the dictionary; just use the final $sym_acc to mean
+ # "unknown ham word", and don't increment it
+ $w1 = $sym_acc;
+ }
+
+ $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
+ }
+
+ # deal with leftovers
+ if ($w2 && $w1) {
+ $tokens{"$w2.$w1"} = 1;
+ }
}
foreach my $tok (keys %tokens) {
# we're not tracking hits; we're killing false positives.
# as soon as a single FP appears, kill all record of that token,
# it cannot be used
- delete $ngram_count{$tok};
- delete $msg_subset_hit{$tok};
+ remove_fp_ing_token($tok);
}
}
+sub remove_fp_ing_token {
+ my $tok = shift;
+ #warn "JMD removing $tok => ".decode_sym2words($tok);
+ delete $ngram_count{$tok};
+ delete $msg_subset_hit{$tok};
+ delete $ngram_count{".".$tok};
+ delete $msg_subset_hit{".".$tok};
+ delete $ngram_count{"..".$tok};
+ delete $msg_subset_hit{"..".$tok};
+}
+
sub filter_into_message_subsets {
logmsg "filtering into message subsets...";
@@ -262,7 +276,7 @@
my $bad;
# must occur more than once!
- if ($count <= 1) {
+ if (!defined $count || $count <= 1) {
$bad++;
}
# require N% spam hits
@@ -272,8 +286,7 @@
if ($bad) {
# we don't need to remember anything about this pattern after here
- delete $ngram_count{$id};
- delete $msg_subset_hit{$id};
+ remove_fp_ing_token($id);
next;
}
@@ -444,7 +457,7 @@
my $pat = shift(@{$pataryref});
last unless defined($pat);
- # warn "JMD $pat";
+ #warn "JMD collapse [$pat]";
$pat =~ s/^\s+//;
# TODO: optimise, second-slowest line