You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2017/03/09 16:04:32 UTC

svn commit: r1786195 - /spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Author: axb
Date: Thu Mar  9 16:04:32 2017
New Revision: 1786195

URL: http://svn.apache.org/viewvc?rev=1786195&view=rev
Log: (empty)

Modified:
    spamassassin/trunk/masses/rule-dev/seek-phrases-in-log

Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?rev=1786195&r1=1786194&r2=1786195&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Thu Mar  9 16:04:32 2017
@@ -27,14 +27,14 @@ seek-phrases-in-log - extract good-looki
 
 sub usage {
   die "
-usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n]
+usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
    [--rules] [--ruletype 'type'] [--ruleprefix FOO]
    [--maxtextread n] --ham hamlog --spam spamlog
 
 --reqhitrate: percentage hit-rate against spam required (default: 0.5)
    (multiple values can be specified, separated by spaces)
 --reqpatlength: required pattern length, in characters (default: 0)
---maxreqpatlength: maximum pattern length, in characters (default: 2048)
+--maxreqpatlength: maximum pattern length, in characters (default: 1024)
 --maxtextread: bytes of message text examined (default: 32768)
 --rules: generate SpamAssassin rule output (default: 0)
 --ruleprefix: specify prefix string for rules (default: 'SEEK_')
@@ -59,6 +59,7 @@ sub logmsg;
 my %opt = ();
 $opt{reqhitrate} = 0.5;
 $opt{reqpatlength} = 0;
+$opt{maxreqpatlength} = 1024;
 $opt{maxtextread} = 32768;
 $opt{rules} = 0;
 $opt{ruleprefix} = 'SEEK_';
@@ -72,6 +73,7 @@ GetOptions(
         "ruleprefix=s" => \$opt{ruleprefix},
         "reqhitrate=s" => \$opt{reqhitrate},
         "reqpatlength=s" => \$opt{reqpatlength},
+        "maxreqpatlength=s" => \$opt{maxreqpatlength},
         "ruletype=s" => \$opt{ruletype},
         "maxtextread=s" => \$opt{maxtextread},
         "phase2=s" => \$opt{phase2},
@@ -152,7 +154,7 @@ sub proc_text_spam {
     $text = substr $text, 0, $opt{maxtextread};      # chop!
   }
 
-  $text =~ s/  +/ /gs;                  # single spaces, please
+  $text =~ s/  +/ /gs;			# single spaces, please
 
   # we only need to save spam samples in memory, ignore ham samples
   push @text_string, $text;
@@ -302,7 +304,7 @@ sub filter_into_message_subsets {
   }
 
   logmsg "message subsets found: ".(scalar
-                                keys %{$asmstate->{all_patterns_for_set}});
+				keys %{$asmstate->{all_patterns_for_set}});
 
   $asmstate->{ngram_count} = \%ngram_count;
   $asmstate->{msg_subset_hit} = \%msg_subset_hit;
@@ -365,8 +367,8 @@ sub assemble_regexps {
   my $count = 0;
   my $count_out = 0;
   foreach my $id (sort {
-              $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
-          } keys %{$asmstate->{ngram_count}})
+	      $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
+	  } keys %{$asmstate->{ngram_count}})
   {
     my $set = $asmstate->{msg_subset_hit}->{$id};
     next if $done_set{$set}; $done_set{$set}++;
@@ -393,12 +395,12 @@ sub assemble_regexps {
     foreach my $pat (@pats) {
       my $subsumed = 0;
       foreach my $done (@done_pats, @pats_new) {
-        # pattern == existing pattern, or existing pattern is contained by
-        # pattern, or pattern is contained in existing pattern
+	# pattern == existing pattern, or existing pattern is contained by
+	# pattern, or pattern is contained in existing pattern
         if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/)
-                                        { $subsumed=1; last; }
-        # or one pattern contains the other (but interpreted as a regexp!)
-        # this deals with /foo.{0,10} bar/ vs /foo ish bar/
+					{ $subsumed=1; last; }
+	# or one pattern contains the other (but interpreted as a regexp!)
+	# this deals with /foo.{0,10} bar/ vs /foo ish bar/
         if ($pat =~ /$done/) { $subsumed=1; last; }
         if ($done =~ /$pat/) { $subsumed=1; last; }
       }
@@ -436,15 +438,15 @@ sub assemble_regexps {
       foreach my $pat (sort @pats) {
         my $name = generate_rule_name($pat);
 
-        if ($opt{ruletype} eq 'header') {
-          # deal with header-specific munging.
-          # "\[\\n\]" is the result of "[\n]", at this stage
-          $pat =~ s/\Q\[\\n\]\E/\\n/gs;
-          $pat =~ s/\Q\[\\t\]\E/\\t/gs;
-        }
+	if ($opt{ruletype} eq 'header') {
+	  # deal with header-specific munging.
+	  # "\[\\n\]" is the result of "[\n]", at this stage
+	  $pat =~ s/\Q\[\\n\]\E/\\n/gs;
+	  $pat =~ s/\Q\[\\t\]\E/\\t/gs;
+	}
 
         print "$opt{ruletype} $opt{ruleprefix}${name}  /$pat/\n";
-        $count_out++;
+	$count_out++;
       }
 
     } else {
@@ -672,8 +674,8 @@ sub expand_with_dots {
 sub ensure_reqpatlength {
   my @ret = @_;
   if ($opt{reqpatlength}) {
-        @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
-        return () unless @ret;
+    @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
+    return () unless @ret;
   }
   return @ret;
 }