You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/07/20 17:16:47 UTC
svn commit: r558018 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: jm
Date: Fri Jul 20 08:16:46 2007
New Revision: 558018

URL: http://svn.apache.org/viewvc?view=rev&rev=558018
Log:
improve memory performance of rule-seeker by discarding 1-hit (hapax) entries every 1000 messages scanned; ensure more chars are quoted correctly in the output; and don't allocate any memory when processing ham for better speed

Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?view=diff&rev=558018&r1=558017&r2=558018
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Fri Jul 20 08:16:46 2007
@@ -108,6 +108,8 @@
   my $cp = pack "l", $msgcount;
   $msgcount++;
 
+  (($msgcount % 1000) == 999) and discard_hapaxes();
+
   my $w1 = '';
   my $w2 = '';
   my $w3 = '';
@@ -142,6 +144,19 @@
   }
 }
 
+sub discard_hapaxes {  # prune every n-gram whose count is exactly 1 (a "hapax") from the global tables
+  my $killed = 0;  # how many entries this pass removed; reported in the warning below
+  foreach my $tok (keys %ngram_count) {  # keys() builds its list before the loop runs, so deleting inside is safe
+    if ($ngram_count{$tok} == 1) {
+      delete $ngram_count{$tok};
+      delete $msg_subset_hit{$tok};  # drop the same key from %msg_subset_hit so the two hashes stay in step
+      $killed++;
+    }
+  }
+  warn "shrunk dbs: $killed hapaxes killed, kept ".  # progress note via warn, not part of the printed rule output
+                    (scalar keys %ngram_count)." entries\n";
+}
+
 sub proc_text_ham {
   my ($text) = @_;
 
@@ -159,7 +174,7 @@
       # since we're deleting, there's no need to add new words
       # to the dictionary; just use the final $sym_acc to mean
       # "unknown ham word", and don't increment it
-      $word2sym{$w} = $w1 = $sym_acc;
+      $w1 = $sym_acc;
     }
 
     $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
@@ -236,7 +251,7 @@
     # to make the pattern bigger, and collapse into a smaller number of
     # pats at the same time
     my @pats = collapse_pats($all_patterns_for_set{$set});
-    # my @pats = collapse_pats_basic($all_patterns_for_set{$set});
+    # my @pats = @{$all_patterns_for_set{$set}};
 
     # now check to see if any of these pats have been subsumed in an
     # already-output pattern (one with more hits!)
@@ -260,7 +275,7 @@
         1.0, ($ngram_count{$id}*100) / $msg_count_spam, 0;
 
       foreach my $pat (@pats) {
-        $pat =~ s/([\\\/\.\(\)\[\]\+\*\@\%\$])/\\$1/gs;
+        $pat =~ s/([!-+\`\^\~\\\/\|\.\(\)\[\]\@])/\\$1/gs;
         my $name = generate_rule_name($pat);
         print "body SEEK_${name}  /$pat/\n";
       }
@@ -287,10 +302,6 @@
   }
   $r =~ s/^\s+//;
   return $r;
-}
-
-sub collapse_pats_basic {
-  return @{$_[0]};
 }
 
 sub collapse_pats {