You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2011/07/25 17:26:01 UTC
svn commit: r1150749 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: axb
Date: Mon Jul 25 15:26:00 2011
New Revision: 1150749

URL: http://svn.apache.org/viewvc?rev=1150749&view=rev
Log:
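Added the --maxreqpatlength option: maximum pattern length, in characters (default: 2048).
Also includes whitespace-only cleanup (tabs converted to spaces, trailing whitespace removed).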

Thanks to Dallas Engelken for the hack.


Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?rev=1150749&r1=1150748&r2=1150749&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Mon Jul 25 15:26:00 2011
@@ -34,6 +34,7 @@ usage: seek-phrases-in-log [--reqhitrate
 --reqhitrate: percentage hit-rate against spam required (default: 0.5)
    (multiple values can be specified, separated by spaces)
 --reqpatlength: required pattern length, in characters (default: 0)
+--maxreqpatlength: maximum pattern length, in characters (default: 2048)
 --maxtextread: bytes of message text examined (default: 32768)
 --rules: generate SpamAssassin rule output (default: 0)
 --ruleprefix: specify prefix string for rules (default: 'SEEK_')
@@ -102,7 +103,7 @@ my $asmstate;
 
 if ($opt{phase2}) {
   $asmstate = load_state($opt{phase2});
-} 
+}
 else {
   logmsg "reading $fs...";
   open IN, "<$fs" or die "cannot open spam log $fs";
@@ -151,7 +152,7 @@ sub proc_text_spam {
     $text = substr $text, 0, $opt{maxtextread};      # chop!
   }
 
-  $text =~ s/  +/ /gs;			# single spaces, please
+  $text =~ s/  +/ /gs;                  # single spaces, please
 
   # we only need to save spam samples in memory, ignore ham samples
   push @text_string, $text;
@@ -183,7 +184,7 @@ sub proc_text_spam {
       # simple bayesian N-grams to start
       $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
     }
-    
+
     # deal with leftovers
     if ($w2 && $w1) {
       $tokens{"$w2.$w1"} = 1;
@@ -234,7 +235,7 @@ sub proc_text_ham {
 
       $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
     }
-    
+
     # deal with leftovers
     if ($w2 && $w1) {
       $tokens{"$w2.$w1"} = 1;
@@ -242,7 +243,7 @@ sub proc_text_ham {
   }
 
   foreach my $tok (keys %tokens) {
-    # we're not tracking hits; we're killing false positives. 
+    # we're not tracking hits; we're killing false positives.
     # as soon as a single FP appears, kill all record of that token,
     # it cannot be used
     remove_fp_ing_token($tok);
@@ -301,7 +302,7 @@ sub filter_into_message_subsets {
   }
 
   logmsg "message subsets found: ".(scalar
-				keys %{$asmstate->{all_patterns_for_set}});
+                                keys %{$asmstate->{all_patterns_for_set}});
 
   $asmstate->{ngram_count} = \%ngram_count;
   $asmstate->{msg_subset_hit} = \%msg_subset_hit;
@@ -364,8 +365,8 @@ sub assemble_regexps {
   my $count = 0;
   my $count_out = 0;
   foreach my $id (sort {
-	      $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
-	  } keys %{$asmstate->{ngram_count}})
+              $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
+          } keys %{$asmstate->{ngram_count}})
   {
     my $set = $asmstate->{msg_subset_hit}->{$id};
     next if $done_set{$set}; $done_set{$set}++;
@@ -392,12 +393,12 @@ sub assemble_regexps {
     foreach my $pat (@pats) {
       my $subsumed = 0;
       foreach my $done (@done_pats, @pats_new) {
-	# pattern == existing pattern, or existing pattern is contained by
-	# pattern, or pattern is contained in existing pattern
+        # pattern == existing pattern, or existing pattern is contained by
+        # pattern, or pattern is contained in existing pattern
         if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/)
-					{ $subsumed=1; last; }
-	# or one pattern contains the other (but interpreted as a regexp!)
-	# this deals with /foo.{0,10} bar/ vs /foo ish bar/
+                                        { $subsumed=1; last; }
+        # or one pattern contains the other (but interpreted as a regexp!)
+        # this deals with /foo.{0,10} bar/ vs /foo ish bar/
         if ($pat =~ /$done/) { $subsumed=1; last; }
         if ($done =~ /$pat/) { $subsumed=1; last; }
       }
@@ -435,15 +436,15 @@ sub assemble_regexps {
       foreach my $pat (sort @pats) {
         my $name = generate_rule_name($pat);
 
-	if ($opt{ruletype} eq 'header') {
-	  # deal with header-specific munging.
-	  # "\[\\n\]" is the result of "[\n]", at this stage
-	  $pat =~ s/\Q\[\\n\]\E/\\n/gs;
-	  $pat =~ s/\Q\[\\t\]\E/\\t/gs;
-	}
+        if ($opt{ruletype} eq 'header') {
+          # deal with header-specific munging.
+          # "\[\\n\]" is the result of "[\n]", at this stage
+          $pat =~ s/\Q\[\\n\]\E/\\n/gs;
+          $pat =~ s/\Q\[\\t\]\E/\\t/gs;
+        }
 
         print "$opt{ruletype} $opt{ruleprefix}${name}  /$pat/\n";
-	$count_out++;
+        $count_out++;
       }
 
     } else {
@@ -576,7 +577,7 @@ sub subsume_with_dotstars {
     foreach my $p2 (@working) {
       next if ($p1 eq $p2);
 
-      DOTLOOP: for my $dotstar 
+      DOTLOOP: for my $dotstar
             ( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" )
       {
         my $newpatcapture = $p1."(".$dotstar.")".$p2;
@@ -671,8 +672,19 @@ sub expand_with_dots {
 sub ensure_reqpatlength {
+  # Filter the given patterns by length; returns the survivors, or an empty
+  # list if none remain.
   my @ret = @_;
   if ($opt{reqpatlength}) {
     @ret = grep { length($_) >= $opt{reqpatlength} } @ret;
     return () unless @ret;
   }
+  # --maxreqpatlength is an upper bound (inclusive) on pattern length.  It is
+  # applied independently of --reqpatlength, and skipped when unset or 0, so
+  # an unregistered/undefined option cannot silently discard every pattern.
+  # NOTE(review): no GetOptions entry or default for maxreqpatlength is
+  # visible in this diff -- confirm the option is registered elsewhere.
+  if ($opt{maxreqpatlength}) {
+    @ret = grep { length($_) <= $opt{maxreqpatlength} } @ret;
+    return () unless @ret;
+  }
   return @ret;
 }