You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2005/02/14 11:04:49 UTC

svn commit: r153758 - in spamassassin/trunk/masses: mass-check parse-rules-for-masses

Author: quinlan
Date: Mon Feb 14 02:04:49 2005
New Revision: 153758

URL: http://svn.apache.org/viewcvs?view=rev&rev=153758
Log:
add --reuse option to mass-check to allow certain network test hits to
  be reused out of X-Spam-Status: if they have a "#reuse" line in the .cf
  files
add sample errors (based on my personal data for now) for sample learning

Modified:
    spamassassin/trunk/masses/mass-check
    spamassassin/trunk/masses/parse-rules-for-masses

Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/mass-check?view=diff&r1=153757&r2=153758
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Mon Feb 14 02:04:49 2005
@@ -64,9 +64,10 @@
   Just left over functions we should remove at some point:
   --bayes       report score from Bayesian classifier
 
-  sample-based learning
+  options used during score generation process
   --learn=N     learn N% of messages as spam or ham
- 
+  --reuse       reuse network checks if X-Spam-Status: is present in messages
+
   non-option arguments are used as target names (mail files and folders),
   the target format is: <class>:<format>:<location>
   <class>       is "spam" or "ham"
@@ -83,11 +84,13 @@
 	    $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots
 	    $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris
 	    $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
-	    $opt_learn
-	    $total_messages $statusevery);
+	    $opt_learn $opt_reuse
+	    $total_messages $statusevery
+	    %reuse);
 
 use FindBin;
 use lib "$FindBin::Bin/../lib";
+use lib "$FindBin::Bin/tmp";
 eval "use bytes";
 use Mail::SpamAssassin::ArchiveIterator;
 use Mail::SpamAssassin;
@@ -96,6 +99,9 @@
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
 use Config;
 
+# for reuse, score set doesn't matter
+require "rules.pl";
+
 # default settings
 $opt_c = "$FindBin::Bin/../rules";
 $opt_p = "$FindBin::Bin/spamassassin";
@@ -109,7 +115,7 @@
 	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
 	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
 	   "rules=s", "restart=i", "after=s", "before=s", "loguris",
-	   "deencap=s", "logmem", "learn=i",
+	   "deencap=s", "logmem", "learn=i", "reuse",
 	   "dir" => sub { $opt_format = "dir"; },
 	   "file" => sub { $opt_format = "file"; },
 	   "mbox" => sub { $opt_format = "mbox"; },
@@ -302,6 +308,19 @@
 
   # remove SpamAssassin markup, if present and the mail was spam
   my $header = $ma->get_header("Received");
+  my $x_spam_status;
+  if ($opt_reuse) {
+    # get X-Spam-Status: header for rule hit reuse
+    $x_spam_status = $ma->get_header("X-Spam-Status");
+  }
+  # previous hits
+  my @previous;
+  if ($x_spam_status) {
+    $x_spam_status =~ s/,\s+/,/gs;
+    if ($x_spam_status =~ m/tests=(.*)(?:\s|$)/g) {
+      push @previous, split(/,/, $1);
+    }
+  }
   if ($header && $header =~ /\bwith SpamAssassin\b/) {
     if (!$opt_deencap || message_should_be_deencapped($ma)) {
       my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
@@ -330,9 +349,17 @@
 
   # sample-based learning
   if ($opt_learn > 0) {
-    # should be changed to be deterministic
-    if (rand(100) < $opt_learn) {
-      my $spam;
+    my $spam;
+    # spam learned as ham = 0.05%
+    if ($class eq 's' && rand(100) < 0.05) {
+      $spam = 0;
+    }
+    # ham learned as spam = 0.01%
+    elsif ($class eq 'h' && rand(100) < 0.01) {
+      $spam = 1;
+    }
+    # spam/ham learned correctly
+    elsif (rand(100) < $opt_learn) {
       if ($class eq 's') {
 	$spam = 1;
       }
@@ -342,6 +369,8 @@
       else {
 	die "unknown class, learning failed";
       }
+    }
+    if (defined $spam) {
       my $result = ($spam ? "spam" : "ham");
       my $status = $spamtest->learn($ma, undef, $spam, 0);
       $learned = $status->did_learn();
@@ -393,8 +422,22 @@
     $extra = '';
   } else {
     $yorn = $status->is_spam() ? 'Y' : '.';
+    # don't bother adjusting scores for reuse
     $score = $status->get_score();
-    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
+    # list of tests hit
+    my @tests;
+    push @tests, split(/,/, $status->get_names_of_tests_hit());
+    push @tests, split(/,/, $status->get_names_of_subtests_hit());
+    # hit reuse
+    if ($x_spam_status) {
+      # drop current hits for rules flagged as skip in %reuse
+      @tests = grep { !$reuse{$_}->{skip} } @tests;
+      # add hits from previous
+      for (@previous) {
+	push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse};
+      }
+    }
+    $tests = join(",", sort(@tests));
     $extra = join(",", @extra);
   }
 

Modified: spamassassin/trunk/masses/parse-rules-for-masses
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/parse-rules-for-masses?view=diff&r1=153757&r2=153758
==============================================================================
--- spamassassin/trunk/masses/parse-rules-for-masses (original)
+++ spamassassin/trunk/masses/parse-rules-for-masses Mon Feb 14 02:04:49 2005
@@ -54,6 +54,7 @@
 $scoreset = 0 if ( !defined $scoreset );
 
 my $rules = { };
+my $reuse = { };
 readrules(@rulesdirs);
 
 my $scores = { };
@@ -85,6 +86,9 @@
           $scores_mutable = 1;
         }
 
+	# oh, this is a dirty dirty hack, but we don't need this at runtime
+	s/^#reuse/reuse/;
+
         s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
 
 	# TODO: this could be overwriting stuff
@@ -125,7 +129,16 @@
 	  }
           $rules->{$name}->{score} = $score;
           $rules->{$name}->{mutable} = $scores_mutable;
-        }
+        } elsif (/^reuse\s+(.*)$/) {
+	  my ($new, @old) = split(' ', $1);
+	  push @old, $new;
+	  for my $old (@old) {
+	    $reuse->{$old} ||= { };
+	    $reuse->{$old}->{reuse} = $new;
+	  }
+	  $reuse->{$new} ||= { };
+	  $reuse->{$new}->{skip} = 1;
+	}
       }
       close IN;
     }
@@ -163,7 +176,7 @@
   print OUT "# dumped at ".`date`."\n";
 
   $Data::Dumper::Purity = 1;
-  print OUT Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']);
+  print OUT Data::Dumper->Dump ([$rules, $scores, $reuse], ['*rules', '*scores', '*reuse']);
 
   print OUT "1;";
   close OUT;