You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/05/20 04:30:41 UTC

svn commit: r171037 - in /spamassassin/trunk/masses/bayes-testing: README bayes-10pcv-driver bayes-static-thresholds bayes-thresholds draw-bayes-histogram graph-bayes-histogram

Author: jm
Date: Thu May 19 19:30:40 2005
New Revision: 171037

URL: http://svn.apache.org/viewcvs?rev=171037&view=rev
Log:
updated bayes-testing code to work again; add GNUplot graphing script

Added:
    spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram   (with props)
Modified:
    spamassassin/trunk/masses/bayes-testing/README
    spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver
    spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds
    spamassassin/trunk/masses/bayes-testing/bayes-thresholds
    spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram

Modified: spamassassin/trunk/masses/bayes-testing/README
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/README?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/README (original)
+++ spamassassin/trunk/masses/bayes-testing/README Thu May 19 19:30:40 2005
@@ -49,6 +49,7 @@
 
 Then split the test corpus into folds:
 
+  mkdir -p cor/ham cor/spam
   $SADIR/tools/split_corpora -n 10 -p cor/ham/bucket ch
   $SADIR/tools/split_corpora -n 10 -p cor/spam/bucket cs
 

Modified: spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-10pcv-driver Thu May 19 19:30:40 2005
@@ -28,9 +28,6 @@
 # this, since bayes will not be activated without 200 messages in the db,
 # and each fold is run using 10% of the corpus -- and 2000/10 = 200.
 
-# CHANGE ME: the path to the version of SpamAssassin you are testing.
-SADIR=/home/jm/ftp/spamassassin
-
 ###########################################################################
 
 testdir=`pwd`
@@ -58,12 +55,11 @@
 echo "
 
 bayes_path                $tmpdir/dbs/bayes
-bayes_use_chi2_combining  1
 bayes_auto_learn          0
+bayes_min_ham_num         10
+bayes_min_spam_num        10
 
 " > $tmpdir/rules/30bayes_path.cf
-# bayes_expiry_use_scan_count 0
-# bayes_expiry_scan_count 500
 mkdir $tmpdir/dbs
 
 INTERLEAVE_TESTS=0
@@ -89,14 +85,14 @@
 
   (
   echo -n "Learning from all ham buckets..." ; date
-  time sa-learn --ham --randseed=1 --no-rebuild $learnargs \
+  time sa-learn --ham --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/*
 
   echo -n "Learning from all spam buckets..." ; date
-  time sa-learn --spam --randseed=1 --no-rebuild $learnargs \
+  time sa-learn --spam --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/*
 
-  time sa-learn --rebuild $learnargs --config-file=$tmpdir/rules
+  time sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
   echo -n "Done learning. " ; date
   ) 2>&1 | tee $results/learn.log
@@ -151,21 +147,21 @@
 
   else
     echo "Learning contents of learn ham bucket..."
-    time sa-learn --ham --randseed=1 --no-rebuild $learnargs \
+    time sa-learn --ham --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn
 
     echo "Learning contents of learn spam bucket..."
-    time sa-learn --spam --randseed=1 --no-rebuild $learnargs \
+    time sa-learn --spam --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn
 
-    time sa-learn --rebuild $learnargs --config-file=$tmpdir/rules
+    time sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
     echo "Dumping bayes DB..."
     ( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \
 	  > $rdir/bayes_db.dump
   fi
 
-  time sa-learn --rebuild --config-file=$tmpdir/rules
+  time sa-learn --sync --config-file=$tmpdir/rules
 
   if [ $INTERLEAVE_TESTS = 1 ] ; then
     # now split the ham and spam test bucket into 10 sub-buckets,

Modified: spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-static-thresholds Thu May 19 19:30:40 2005
@@ -9,8 +9,8 @@
 my $spam = $ARGV[0] || "spam.log";
 my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
 
-my $hamcutoff = 0.30;
-my $spamcutoff = 0.70;
+my $hamcutoff = 0.20;
+my $spamcutoff = 0.80;
 
 my $nbuckets = 50;
 my $range_lo = 0.00;
@@ -46,7 +46,7 @@
   my $isspam = 0; ($file eq $spam) and $isspam = 1;
 
   while (<IN>) {
-    /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+    /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
     my $score = $2+0;
     if ($score == 1) { $score = 0.9999999999999; }
 

Modified: spamassassin/trunk/masses/bayes-testing/bayes-thresholds
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/bayes-thresholds?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/bayes-thresholds (original)
+++ spamassassin/trunk/masses/bayes-testing/bayes-thresholds Thu May 19 19:30:40 2005
@@ -42,7 +42,7 @@
   my $isspam = 0; ($file eq $spam) and $isspam = 1;
 
   while (<IN>) {
-    /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+    /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
     my $score = $2+0;
 
     my $bucket_id;

Modified: spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram?rev=171037&r1=171036&r2=171037&view=diff
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram (original)
+++ spamassassin/trunk/masses/bayes-testing/draw-bayes-histogram Thu May 19 19:30:40 2005
@@ -48,7 +48,7 @@
   my $isspam = 0; ($file eq $spam) and $isspam = 1;
 
   while (<IN>) {
-    /^(\.|Y)\s.+bayes=(\S+)$/ or next;
+    /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
     my $score = $2+0;
 
     my $bucket_id;

Added: spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram?rev=171037&view=auto
==============================================================================
--- spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram (added)
+++ spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram Thu May 19 19:30:40 2005
@@ -0,0 +1,125 @@
+#!/usr/bin/perl -w
+#
+# Given a 'results' dir from a bayes-10pcv-driver run,
+# graph a histogram of the score ranges using GNUPlot.
+#
+# usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ...
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+
+use strict;
+use Getopt::Long;
+
+# Parse options explicitly; bad options or a missing results dir are fatal.
+my $opt_buckets;
+GetOptions("buckets=i" => \$opt_buckets) && @ARGV
+  or die "usage: graph-bayes-histogram [--buckets=100] dir/results ...\n";
+my $buckets = $opt_buckets || 100;
+$buckets > 0 or die "--buckets must be a positive integer\n";
+my ($range_lo, $range_hi) = (0.0, 1.0);
+
+# Shared with dofile(): per-bucket spam/nonspam counts and the bucket edges.
+my (%bux_sp, %bux_ns, @buckets);
+
+# Emit exactly $buckets+1 edges.  Counting iterations, instead of comparing
+# the float accumulator with $range_hi, keeps rounding error from deciding
+# whether the top edge exists; accumulating keeps edges exactly $step apart.
+my $step = ($range_hi - $range_lo) / $buckets;
+my $edge = $range_lo;
+foreach (0 .. $buckets) {
+  push (@buckets, $edge);
+  $edge += $step;
+}
+
+open(DATA, ">plot.data") or die "cannot write plot.data: $!";
+my @dirs = @ARGV;
+foreach my $setcount (0 .. $#dirs) {
+  # zero the counters so each results dir becomes its own data set
+  $bux_ns{$_} = $bux_sp{$_} = 0 foreach @buckets;
+  dofile($setcount, "$dirs[$setcount]/spam_all.log", "$dirs[$setcount]/nonspam_all.log");
+}
+close(DATA) or die "error writing plot.data: $!";
+
+open (OUT, "| gnuplot -") or die "cannot run gnuplot";
+select(OUT);
+
+print "
+set xlabel 'P(spam)'
+set ylabel 'Frequency'
+set logscale y 2
+set xrange [0.0:1.01]
+set yrange []
+set xtics 0,0.1,0.99
+set terminal png crop
+set out 'graph.png'
+
+plot ";
+
+# A data set's gnuplot 'index' is the position of its dir in @dirs.
+my @text = ();
+my $t = 0;
+foreach my $s (0 .. $#dirs) {
+  my $dir = $dirs[$s];
+  $t++; push (@text, "  'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $dir'");
+  $t++; push (@text, "  'plot.data' using 1:3 index $s with linesp lt $t pt $t t 'spam, $dir'");
+}
+
+print join(", \\\n", @text);
+print "\n";
+
+close(OUT) or die "gnuplot failed (exit status $?)";
+exit;
+
+
+# Tally the bayes= scores in the $spam and $nonspam logs into %bux_sp and
+# %bux_ns, then write them to DATA as gnuplot data set number $setcount.
+sub dofile {
+  my ($setcount, $spam, $nonspam) = @_;
+
+  foreach my $file ($spam, $nonspam) {
+    open (my $in, '<', $file) or die "Could not open file '$file': $!";
+    my $counts = ($file eq $spam) ? \%bux_sp : \%bux_ns;
+
+    while (<$in>) {
+      /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
+      my $score = $2+0;
+
+      # Scores outside every [edge, edge+step) range are clamped to the
+      # nearest end bucket instead of being counted under an undef key.
+      my $bucket_id = ($score < $buckets[0]) ? $buckets[0] : $buckets[-1];
+      foreach my $bucket (@buckets) {
+        if ($score >= $bucket && $score < $bucket+$step) {
+          $bucket_id = $bucket; last;
+        }
+      }
+      $counts->{$bucket_id}++;
+    }
+    close ($in);
+  }
+
+  # Shift x by 0.001 per data set (presumably so overlaid curves differ).
+  my $sideoffset = 0.001*$setcount;
+  foreach my $bucket (@buckets) {
+    my $ns = $bux_ns{$bucket};
+    my $sp = $bux_sp{$bucket};
+    my $xpos = $bucket + $sideoffset;
+    print DATA "$xpos $ns $sp\n";
+  }
+  print DATA "\n\n";
+}
+

Propchange: spamassassin/trunk/masses/bayes-testing/graph-bayes-histogram
------------------------------------------------------------------------------
    svn:executable = *