You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2005/02/14 11:04:49 UTC
svn commit: r153758 - in spamassassin/trunk/masses: mass-check
parse-rules-for-masses
Author: quinlan
Date: Mon Feb 14 02:04:49 2005
New Revision: 153758
URL: http://svn.apache.org/viewcvs?view=rev&rev=153758
Log:
add --reuse option to mass-check to allow certain network test hits to
be reused out of X-Spam-Status: if they have a "#reuse" line in the .cf
files
add simulated mislearning errors (rates based on my personal data for now) to sample-based learning
Modified:
spamassassin/trunk/masses/mass-check
spamassassin/trunk/masses/parse-rules-for-masses
Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/mass-check?view=diff&r1=153757&r2=153758
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Mon Feb 14 02:04:49 2005
@@ -64,9 +64,10 @@
Just left over functions we should remove at some point:
--bayes report score from Bayesian classifier
- sample-based learning
+ options used during score generation process
--learn=N learn N% of messages as spam or ham
-
+ --reuse reuse network checks if X-Spam-Status: is present in messages
+
non-option arguments are used as target names (mail files and folders),
the target format is: <class>:<format>:<location>
<class> is "spam" or "ham"
@@ -83,11 +84,13 @@
$opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots
$opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris
$opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
- $opt_learn
- $total_messages $statusevery);
+ $opt_learn $opt_reuse
+ $total_messages $statusevery
+ %reuse);
use FindBin;
use lib "$FindBin::Bin/../lib";
+use lib "$FindBin::Bin/tmp";
eval "use bytes";
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
@@ -96,6 +99,9 @@
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
use Config;
+# for reuse, score set doesn't matter
+require "rules.pl";
+
# default settings
$opt_c = "$FindBin::Bin/../rules";
$opt_p = "$FindBin::Bin/spamassassin";
@@ -109,7 +115,7 @@
"hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
"progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
"rules=s", "restart=i", "after=s", "before=s", "loguris",
- "deencap=s", "logmem", "learn=i",
+ "deencap=s", "logmem", "learn=i", "reuse",
"dir" => sub { $opt_format = "dir"; },
"file" => sub { $opt_format = "file"; },
"mbox" => sub { $opt_format = "mbox"; },
@@ -302,6 +308,19 @@
# remove SpamAssassin markup, if present and the mail was spam
my $header = $ma->get_header("Received");
+ my $x_spam_status;
+ if ($opt_reuse) {
+ # get X-Spam-Status: header for rule hit reuse
+ $x_spam_status = $ma->get_header("X-Spam-Status");
+ }
+ # previous hits
+ my @previous;
+ if ($x_spam_status) {
+ $x_spam_status =~ s/,\s+/,/gs;
+ if ($x_spam_status =~ m/tests=(.*)(?:\s|$)/g) {
+ push @previous, split(/,/, $1);
+ }
+ }
if ($header && $header =~ /\bwith SpamAssassin\b/) {
if (!$opt_deencap || message_should_be_deencapped($ma)) {
my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
@@ -330,9 +349,17 @@
# sample-based learning
if ($opt_learn > 0) {
- # should be changed to be deterministic
- if (rand(100) < $opt_learn) {
- my $spam;
+ my $spam;
+ # spam learned as ham = 0.05%
+ if ($class eq 's' && rand(100) < 0.05) {
+ $spam = 0;
+ }
+ # ham learned as spam = 0.01%
+ elsif ($class eq 'h' && rand(100) < 0.01) {
+ $spam = 1;
+ }
+ # spam/ham learned correctly
+ elsif (rand(100) < $opt_learn) {
if ($class eq 's') {
$spam = 1;
}
@@ -342,6 +369,8 @@
else {
die "unknown class, learning failed";
}
+ }
+ if (defined $spam) {
my $result = ($spam ? "spam" : "ham");
my $status = $spamtest->learn($ma, undef, $spam, 0);
$learned = $status->did_learn();
@@ -393,8 +422,22 @@
$extra = '';
} else {
$yorn = $status->is_spam() ? 'Y' : '.';
+ # don't bother adjusting scores for reuse
$score = $status->get_score();
- $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
+ # list of tests hit
+ my @tests;
+ push @tests, split(/,/, $status->get_names_of_tests_hit());
+ push @tests, split(/,/, $status->get_names_of_subtests_hit());
+ # hit reuse
+ if ($x_spam_status) {
+ # remove freshly computed hits for rules that are marked as skip
+ @tests = grep { !$reuse{$_}->{skip} } @tests;
+ # add hits from previous
+ for (@previous) {
+ push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse};
+ }
+ }
+ $tests = join(",", sort(@tests));
$extra = join(",", @extra);
}
Modified: spamassassin/trunk/masses/parse-rules-for-masses
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/parse-rules-for-masses?view=diff&r1=153757&r2=153758
==============================================================================
--- spamassassin/trunk/masses/parse-rules-for-masses (original)
+++ spamassassin/trunk/masses/parse-rules-for-masses Mon Feb 14 02:04:49 2005
@@ -54,6 +54,7 @@
$scoreset = 0 if ( !defined $scoreset );
my $rules = { };
+my $reuse = { };
readrules(@rulesdirs);
my $scores = { };
@@ -85,6 +86,9 @@
$scores_mutable = 1;
}
+ # oh, this is a dirty dirty hack, but we don't need this at runtime
+ s/^#reuse/reuse/;
+
s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
# TODO: this could be overwriting stuff
@@ -125,7 +129,16 @@
}
$rules->{$name}->{score} = $score;
$rules->{$name}->{mutable} = $scores_mutable;
- }
+ } elsif (/^reuse\s+(.*)$/) {
+ my ($new, @old) = split(' ', $1);
+ push @old, $new;
+ for my $old (@old) {
+ $reuse->{$old} ||= { };
+ $reuse->{$old}->{reuse} = $new;
+ }
+ $reuse->{$new} ||= { };
+ $reuse->{$new}->{skip} = 1;
+ }
}
close IN;
}
@@ -163,7 +176,7 @@
print OUT "# dumped at ".`date`."\n";
$Data::Dumper::Purity = 1;
- print OUT Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']);
+ print OUT Data::Dumper->Dump ([$rules, $scores, $reuse], ['*rules', '*scores', '*reuse']);
print OUT "1;";
close OUT;