You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/25 15:27:31 UTC

svn commit: r559451 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: jm
Date: Wed Jul 25 06:27:29 2007
New Revision: 559451

URL: http://svn.apache.org/viewvc?view=rev&rev=559451
Log:
ensure that rules with identical FP/FN rates are output in the same order from one run to the next

Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=559451&r1=559450&r2=559451
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Wed Jul 25 06:27:29 2007
@@ -82,6 +82,13 @@
   /^text: (.*)$/ and proc_text_spam($1);
 }
 close IN;
+
+# only do this if we have read enough spam messages, otherwise we could
+# discard tokens for which reqhitrate has been achieved
+if ($msgcount > 2 * (100 / $opt{reqhitrate})) {
+  discard_hapaxes();
+}
+
 warn "n-grams active: ".(scalar keys %ngram_count)."\n";
 
 warn "reading $fh...\n";
@@ -151,13 +158,13 @@
 sub discard_hapaxes {
   my $before = (scalar keys %ngram_count);
   foreach my $tok (keys %ngram_count) {
-    if ($ngram_count{$tok} == 1) {
+    if ($ngram_count{$tok} <= 1) {
       delete $ngram_count{$tok};
       delete $msg_subset_hit{$tok};
     }
   }
   my $after = (scalar keys %ngram_count);
-  my $killed = ($after - $before);
+  my $killed = ($before - $after);
   warn "shrunk dbs: $killed hapaxes killed, kept $after entries\n";
 }
 
@@ -281,7 +288,8 @@
       printf "# %6.3f  %6.3f  %6.3f\n",
         1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0;
 
-      foreach my $pat (@pats) {
+      # sort, to ensure ordering always remains the same
+      foreach my $pat (sort @pats) {
         $pat =~ s/([!-+\`\^\~\\\/\|\.\(\)\[\]\@])/\\$1/gs;
         my $name = generate_rule_name($pat);
         print "body $opt{ruleprefix}${name}  /$pat/\n";