You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2017/03/09 16:04:32 UTC
svn commit: r1786195 -
/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Author: axb
Date: Thu Mar 9 16:04:32 2017
New Revision: 1786195
URL: http://svn.apache.org/viewvc?rev=1786195&view=rev
Log: (empty)
Modified:
spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
Modified: spamassassin/trunk/masses/rule-dev/seek-phrases-in-log
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/seek-phrases-in-log?rev=1786195&r1=1786194&r2=1786195&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-dev/seek-phrases-in-log (original)
+++ spamassassin/trunk/masses/rule-dev/seek-phrases-in-log Thu Mar 9 16:04:32 2017
@@ -27,14 +27,14 @@ seek-phrases-in-log - extract good-looki
sub usage {
die "
-usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n]
+usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
[--rules] [--ruletype 'type'] [--ruleprefix FOO]
[--maxtextread n] --ham hamlog --spam spamlog
--reqhitrate: percentage hit-rate against spam required (default: 0.5)
(multiple values can be specified, separated by spaces)
--reqpatlength: required pattern length, in characters (default: 0)
---maxreqpatlength: maximum pattern length, in characters (default: 2048)
+--maxreqpatlength: maximum pattern length, in characters (default: 1024)
--maxtextread: bytes of message text examined (default: 32768)
--rules: generate SpamAssassin rule output (default: 0)
--ruleprefix: specify prefix string for rules (default: 'SEEK_')
@@ -59,6 +59,7 @@ sub logmsg;
my %opt = ();
$opt{reqhitrate} = 0.5;
$opt{reqpatlength} = 0;
+$opt{maxreqpatlength} = 1024;
$opt{maxtextread} = 32768;
$opt{rules} = 0;
$opt{ruleprefix} = 'SEEK_';
@@ -72,6 +73,7 @@ GetOptions(
"ruleprefix=s" => \$opt{ruleprefix},
"reqhitrate=s" => \$opt{reqhitrate},
"reqpatlength=s" => \$opt{reqpatlength},
+ "maxreqpatlength=s" => \$opt{maxreqpatlength},
"ruletype=s" => \$opt{ruletype},
"maxtextread=s" => \$opt{maxtextread},
"phase2=s" => \$opt{phase2},
@@ -152,7 +154,7 @@ sub proc_text_spam {
$text = substr $text, 0, $opt{maxtextread}; # chop!
}
- $text =~ s/ +/ /gs; # single spaces, please
+ $text =~ s/ +/ /gs; # single spaces, please
# we only need to save spam samples in memory, ignore ham samples
push @text_string, $text;
@@ -302,7 +304,7 @@ sub filter_into_message_subsets {
}
logmsg "message subsets found: ".(scalar
- keys %{$asmstate->{all_patterns_for_set}});
+ keys %{$asmstate->{all_patterns_for_set}});
$asmstate->{ngram_count} = \%ngram_count;
$asmstate->{msg_subset_hit} = \%msg_subset_hit;
@@ -365,8 +367,8 @@ sub assemble_regexps {
my $count = 0;
my $count_out = 0;
foreach my $id (sort {
- $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
- } keys %{$asmstate->{ngram_count}})
+ $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
+ } keys %{$asmstate->{ngram_count}})
{
my $set = $asmstate->{msg_subset_hit}->{$id};
next if $done_set{$set}; $done_set{$set}++;
@@ -393,12 +395,12 @@ sub assemble_regexps {
foreach my $pat (@pats) {
my $subsumed = 0;
foreach my $done (@done_pats, @pats_new) {
- # pattern == existing pattern, or existing pattern is contained by
- # pattern, or pattern is contained in existing pattern
+ # pattern == existing pattern, or existing pattern is contained by
+ # pattern, or pattern is contained in existing pattern
if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/)
- { $subsumed=1; last; }
- # or one pattern contains the other (but interpreted as a regexp!)
- # this deals with /foo.{0,10} bar/ vs /foo ish bar/
+ { $subsumed=1; last; }
+ # or one pattern contains the other (but interpreted as a regexp!)
+ # this deals with /foo.{0,10} bar/ vs /foo ish bar/
if ($pat =~ /$done/) { $subsumed=1; last; }
if ($done =~ /$pat/) { $subsumed=1; last; }
}
@@ -436,15 +438,15 @@ sub assemble_regexps {
foreach my $pat (sort @pats) {
my $name = generate_rule_name($pat);
- if ($opt{ruletype} eq 'header') {
- # deal with header-specific munging.
- # "\[\\n\]" is the result of "[\n]", at this stage
- $pat =~ s/\Q\[\\n\]\E/\\n/gs;
- $pat =~ s/\Q\[\\t\]\E/\\t/gs;
- }
+ if ($opt{ruletype} eq 'header') {
+ # deal with header-specific munging.
+ # "\[\\n\]" is the result of "[\n]", at this stage
+ $pat =~ s/\Q\[\\n\]\E/\\n/gs;
+ $pat =~ s/\Q\[\\t\]\E/\\t/gs;
+ }
print "$opt{ruletype} $opt{ruleprefix}${name} /$pat/\n";
- $count_out++;
+ $count_out++;
}
} else {
@@ -672,8 +674,8 @@ sub expand_with_dots {
sub ensure_reqpatlength {
my @ret = @_;
if ($opt{reqpatlength}) {
- @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
- return () unless @ret;
+ @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
+ return () unless @ret;
}
return @ret;
}