You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2008/12/29 11:50:17 UTC
svn commit: r729859 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: jm
Date: Mon Dec 29 02:50:16 2008
New Revision: 729859

URL: http://svn.apache.org/viewvc?rev=729859&view=rev
Log:
bug: some FPing tokens would not be removed if they occurred in FPs at the start of a 'paragraph'. fix

Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?rev=729859&r1=729858&r2=729859&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Mon Dec 29 02:50:16 2008
@@ -185,6 +185,7 @@
   }
 
   foreach my $tok (keys %tokens) {
+    #warn "JMD adding $tok => ".decode_sym2words($tok);
     # incr the counter for this token
     $ngram_count{$tok}++;
     $msg_subset_hit{$tok} .= $cp;    # the message subset hit by this tok
@@ -207,39 +208,52 @@
 sub proc_text_ham {
   my ($text) = @_;
 
-  my $w1 = '';
-  my $w2 = '';
-  my $w3 = '';
-
   my %tokens = ();
-  foreach my $w (split(' ', $text)) {
-    $w3 = $w2;
-    $w2 = $w1;
-
-    $w1 = $word2sym{$w};
-    if (!$w1) {
-      # since we're deleting, there's no need to add new words
-      # to the dictionary; just use the final $sym_acc to mean
-      # "unknown ham word", and don't increment it
-      $w1 = $sym_acc;
-    }
-
-    $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
-  }
-  
-  if ($w2 && $w1) {
-    $tokens{"$w2.$w1"} = 1;
+  foreach my $line (split(/\[p\]/, $text)) {
+    my $w1 = '';
+    my $w2 = '';
+    my $w3 = '';
+
+    foreach my $w (split(' ', $line)) {
+      $w3 = $w2;
+      $w2 = $w1;
+
+      $w1 = $word2sym{$w};
+      if (!$w1) {
+        # since we're deleting, there's no need to add new words
+        # to the dictionary; just use the final $sym_acc to mean
+        # "unknown ham word", and don't increment it
+        $w1 = $sym_acc;
+      }
+
+      $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
+    }
+    
+    # deal with leftovers
+    if ($w2 && $w1) {
+      $tokens{"$w2.$w1"} = 1;
+    }
   }
 
   foreach my $tok (keys %tokens) {
     # we're not tracking hits; we're killing false positives. 
     # as soon as a single FP appears, kill all record of that token,
     # it cannot be used
-    delete $ngram_count{$tok};
-    delete $msg_subset_hit{$tok};
+    remove_fp_ing_token($tok);
   }
 }
 
+sub remove_fp_ing_token {
+  my $tok = shift;
+  #warn "JMD removing $tok => ".decode_sym2words($tok);
+  delete $ngram_count{$tok};
+  delete $msg_subset_hit{$tok};
+  delete $ngram_count{".".$tok};
+  delete $msg_subset_hit{".".$tok};
+  delete $ngram_count{"..".$tok};
+  delete $msg_subset_hit{"..".$tok};
+}
+
 sub filter_into_message_subsets {
   logmsg "filtering into message subsets...";
 
@@ -262,7 +276,7 @@
     my $bad;
 
     # must occur more than once!
-    if ($count <= 1) {
+    if (!defined $count || $count <= 1) {
       $bad++;
     }
     # require N% spam hits
@@ -272,8 +286,7 @@
 
     if ($bad) {
       # we don't need to remember anything about this pattern after here
-      delete $ngram_count{$id};
-      delete $msg_subset_hit{$id};
+      remove_fp_ing_token($id);
       next;
     }
 
@@ -444,7 +457,7 @@
     my $pat = shift(@{$pataryref});
     last unless defined($pat);
 
-    # warn "JMD $pat";
+    #warn "JMD collapse [$pat]";
     $pat =~ s/^\s+//;
 
     # TODO: optimise, second-slowest line