You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/07/22 06:07:14 UTC

svn commit: rev 23133 - in spamassassin/trunk/masses: . tenpass

Author: jm
Date: Wed Jul 21 21:07:13 2004
New Revision: 23133

Added:
   spamassassin/trunk/masses/compare-models
   spamassassin/trunk/masses/config.set0
   spamassassin/trunk/masses/config.set1
   spamassassin/trunk/masses/extract-results
   spamassassin/trunk/masses/generate-corpus
   spamassassin/trunk/masses/tenpass/split-log-into-buckets-random   (contents, props changed)
   spamassassin/trunk/masses/validate-model
Modified:
   spamassassin/trunk/masses/config
   spamassassin/trunk/masses/logs-to-c
   spamassassin/trunk/masses/mk-baseline-results
   spamassassin/trunk/masses/perceptron.c
   spamassassin/trunk/masses/runGA
Log:
bug 3584: improvements to score learning system; lots, too many to list here.  Henry's patch

Added: spamassassin/trunk/masses/compare-models
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/compare-models	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,322 @@
+#!/usr/bin/perl
+
+# This script is used to do a statistical comparative analysis of two
+# cross-validation result sets (as produced by validate-model).
+
+use Statistics::Distributions;
+use strict;
+
+my $alpha = 0.95; # acceptable confidence in not making a type 1 error,
+		  # i.e. wrongly concluding that the means are different
+my $lambda = 50;  # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 2 ) {
+	print STDERR "Usage: compare-models [validate1] [validate2]\n";
+	exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);
+
+open (FILE, $ARGV[0]) || die $!;
+while (<FILE>) {
+	my @x = split(/\s+/);
+	push (@fp1, $x[2] / ($x[0] + $x[2]));
+	push (@fn1, $x[3] / ($x[1] + $x[3]));
+	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+my (@fp2, @fn2, @tcr2);
+
+open (FILE, $ARGV[1]) || die $!;
+while (<FILE>) {
+	my @x = split(/\s+/);
+	push (@fp2, $x[2] / ($x[0] + $x[2]));
+	push (@fn2, $x[3] / ($x[1] + $x[3]));
+	push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+stat_analysis ("False positives", "pct", \@fp1, \@fp2);
+stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
+
+sub stat_analysis {
+	my $title = shift;
+	my $pct = shift;
+	my $s1 = shift;
+	my $s2 = shift;
+
+	unless ( scalar(@$s1) == scalar(@$s1) ) {
+		print STDERR "Can't compute stats for $title.  Samples are not paired.\n";
+		return;
+	}
+
+	# This is the number of degrees of freedom of the two sample sets (i.e.
+	# the number of samples in each set).
+	my $dof = scalar(@{$s1});
+
+	print "$title:\n";
+
+	# Compute the mean and standard deviation of the first sample
+	# mean = 1/n * sum(s[i])
+	my $mean_s1 = 0;
+	foreach my $i (1..$dof) {
+		$mean_s1 += $$s1[$i];
+	}
+	$mean_s1 /= $dof;
+
+	# var = 1/(n-1) * sum((mean - s[i])^2)
+	my $var_s1 = 0;
+	foreach my $i (1..$dof) {
+		$var_s1 += ($mean_s1 - $$s1[$i])**2;
+	}
+	$var_s1 /= $dof - 1;
+
+	# std = sqrt(var)
+	my $std_s1 = sqrt($var_s1);
+
+	# Compute the mean and standard deviation of the second sample
+	# mean = 1/n * sum(s[i])
+	my $mean_s2 = 0;
+	foreach my $i (1..$dof) {
+		$mean_s2 += $$s2[$i];
+	}
+	$mean_s2 /= $dof;
+
+	# var = 1/(n-1) * sum((mean - s[i])^2)
+	my $var_s2 = 0;
+	foreach my $i (1..$dof) {
+		$var_s2 += ($mean_s2 - $$s2[$i])**2;
+	}
+	$var_s2 /= $dof - 1;
+
+	# std = sqrt(var)
+	my $std_s2 = sqrt($var_s2);
+
+	# SA developers like percentage points instead of probabilities.
+	if ( $pct eq "pct" ) {
+		printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
+		printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
+	} else {
+		printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
+		printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
+	}
+
+	# Compute the mean of the differences between the two samples
+	my $mean_d = 0;
+	foreach my $i (1..$dof) {
+		$mean_d += $$s1[$i] - $$s2[$i];
+	}
+	$mean_d /= $dof;
+
+	# Compute the variance of the differences between the two samples
+	my $var_d = 0;
+	foreach my $i (1..$dof) {
+		$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
+	}
+	$var_d /= $dof - 1;
+	my $std_d = sqrt($var_d);
+
+	# To determine whether two samples are from the same distribution
+	# (i.e. they have the same mean), we are going to use a paired sample
+	# t-test.  You can find more information about this in Tom Mitchell's
+	# "Machine Learning" book.
+	
+	# Let t = mean / (std / sqrt(n))
+	my $tstat;
+	if ( $var_d > 0 ) {
+		$tstat = $mean_d / sqrt($var_d / $dof);
+	} else {
+		$tstat = 0;
+	}
+
+	# Now we find the critical value of t for the alpha% confidence
+	# interval.
+	my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
+
+	# This is the probability that the two distributions are from different
+	# means.
+	my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
+
+	# If the t statistic is less than the critical value, this means that
+	# we should accept the null hypothesis (the distributions have the same
+	# mean), otherwise it should be rejected.
+	if ( abs($tstat) < $tcrit ) {
+		printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
+	} else {
+		printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
+	}
+
+	# This displays an estimate of the confidence interval around the
+	# estimated mean difference between the two samples.  Bear in mind that
+	# the t statistic is working on the local differences and not the
+	# global difference.
+	if ( $pct eq "pct" ) {
+		printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
+	} else {
+		printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
+	}
+
+	print "\n";
+}
+#!/usr/bin/perl
+
+# This script is used to do a statistical comparative analysis of two
+# cross-validation result sets (as produced by validate-model).
+
+use Statistics::Distributions;
+use strict;
+
+my $alpha = 0.95; # acceptable confidence in not making a type 1 error,
+		  # i.e. wrongly concluding that the means are different
+my $lambda = 50;  # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 2 ) {
+	print STDERR "Usage: compare-models [validate1] [validate2]\n";
+	exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);
+
+open (FILE, $ARGV[0]) || die $!;
+while (<FILE>) {
+	my @x = split(/\s+/);
+	push (@fp1, $x[2] / ($x[0] + $x[2]));
+	push (@fn1, $x[3] / ($x[1] + $x[3]));
+	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+my (@fp2, @fn2, @tcr2);
+
+open (FILE, $ARGV[1]) || die $!;
+while (<FILE>) {
+	my @x = split(/\s+/);
+	push (@fp2, $x[2] / ($x[0] + $x[2]));
+	push (@fn2, $x[3] / ($x[1] + $x[3]));
+	push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+stat_analysis ("False positives", "pct", \@fp1, \@fp2);
+stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
+
+sub stat_analysis {
+	my $title = shift;
+	my $pct = shift;
+	my $s1 = shift;
+	my $s2 = shift;
+
+	unless ( scalar(@$s1) == scalar(@$s1) ) {
+		print STDERR "Can't compute stats for $title.  Samples are not paired.\n";
+		return;
+	}
+
+	# This is the number of degrees of freedom of the two sample sets (i.e.
+	# the number of samples in each set).
+	my $dof = scalar(@{$s1});
+
+	print "$title:\n";
+
+	# Compute the mean and standard deviation of the first sample
+	# mean = 1/n * sum(s[i])
+	my $mean_s1 = 0;
+	foreach my $i (1..$dof) {
+		$mean_s1 += $$s1[$i];
+	}
+	$mean_s1 /= $dof;
+
+	# var = 1/(n-1) * sum((mean - s[i])^2)
+	my $var_s1 = 0;
+	foreach my $i (1..$dof) {
+		$var_s1 += ($mean_s1 - $$s1[$i])**2;
+	}
+	$var_s1 /= $dof - 1;
+
+	# std = sqrt(var)
+	my $std_s1 = sqrt($var_s1);
+
+	# Compute the mean and standard deviation of the second sample
+	# mean = 1/n * sum(s[i])
+	my $mean_s2 = 0;
+	foreach my $i (1..$dof) {
+		$mean_s2 += $$s2[$i];
+	}
+	$mean_s2 /= $dof;
+
+	# var = 1/(n-1) * sum((mean - s[i])^2)
+	my $var_s2 = 0;
+	foreach my $i (1..$dof) {
+		$var_s2 += ($mean_s2 - $$s2[$i])**2;
+	}
+	$var_s2 /= $dof - 1;
+
+	# std = sqrt(var)
+	my $std_s2 = sqrt($var_s2);
+
+	# SA developers like percentage points instead of probabilities.
+	if ( $pct eq "pct" ) {
+		printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
+		printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
+	} else {
+		printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
+		printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
+	}
+
+	# Compute the mean of the differences between the two samples
+	my $mean_d = 0;
+	foreach my $i (1..$dof) {
+		$mean_d += $$s1[$i] - $$s2[$i];
+	}
+	$mean_d /= $dof;
+
+	# Compute the variance of the differences between the two samples
+	my $var_d = 0;
+	foreach my $i (1..$dof) {
+		$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
+	}
+	$var_d /= $dof - 1;
+	my $std_d = sqrt($var_d);
+
+	# To determine whether two samples are from the same distribution
+	# (i.e. they have the same mean), we are going to use a paired sample
+	# t-test.  You can find more information about this in Tom Mitchell's
+	# "Machine Learning" book.
+	
+	# Let t = mean / (std / sqrt(n))
+	my $tstat;
+	if ( $var_d > 0 ) {
+		$tstat = $mean_d / sqrt($var_d / $dof);
+	} else {
+		$tstat = 0;
+	}
+
+	# Now we find the critical value of t for the alpha% confidence
+	# interval.
+	my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
+
+	# This is the probability that the two distributions are from different
+	# means.
+	my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
+
+	# If the t statistic is less than the critical value, this means that
+	# we should accept the null hypothesis (the distributions have the same
+	# mean), otherwise it should be rejected.
+	if ( abs($tstat) < $tcrit ) {
+		printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
+	} else {
+		printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
+	}
+
+	# This displays an estimate of the confidence interval around the
+	# estimated mean difference between the two samples.  Bear in mind that
+	# the t statistic is working on the local differences and not the
+	# global difference.
+	if ( $pct eq "pct" ) {
+		printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
+	} else {
+		printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
+	}
+
+	print "\n";
+}

Modified: spamassassin/trunk/masses/config
==============================================================================
--- spamassassin/trunk/masses/config	(original)
+++ spamassassin/trunk/masses/config	Wed Jul 21 21:07:13 2004
@@ -1 +1,5 @@
 SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=

Added: spamassassin/trunk/masses/config.set0
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/config.set0	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,15 @@
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=

Added: spamassassin/trunk/masses/config.set1
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/config.set1	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,15 @@
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=

Added: spamassassin/trunk/masses/extract-results
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/extract-results	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
+# of the logs-to-c program.
+#
+# This is used by the validate-model script to aggregate the results of a
+# cross validation for analysis with the compare-models script.
+
+use strict;
+
+foreach my $file (@ARGV) {
+	open (FILE, "<$file") || die $!;
+
+	my ($tp, $tn, $fp, $fn);
+
+	while (<FILE>) {
+		if ( /Correctly non-spam:\s*(\d+)/ ) {
+			$tn = $1;
+		} elsif ( /Correctly spam:\s*(\d+)/ ) {
+			$tp = $1;
+		} elsif ( /False positives:\s*(\d+)/ ) {
+			$fp = $1;
+		} elsif ( /False negatives:\s*(\d+)/ ) {
+			$fn = $1;
+		}
+	}
+
+	close (FILE);
+
+	printf "%d %d %d %d\n", $tn, $tp, $fp, $fn;
+}
+#!/usr/bin/perl
+
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
+# of the logs-to-c program.
+#
+# This is used by the validate-model script to aggregate the results of a
+# cross validation for analysis with the compare-models script.
+
+use strict;
+
+foreach my $file (@ARGV) {
+	open (FILE, "<$file") || die $!;
+
+	my ($tp, $tn, $fp, $fn);
+
+	while (<FILE>) {
+		if ( /Correctly non-spam:\s*(\d+)/ ) {
+			$tn = $1;
+		} elsif ( /Correctly spam:\s*(\d+)/ ) {
+			$tp = $1;
+		} elsif ( /False positives:\s*(\d+)/ ) {
+			$fp = $1;
+		} elsif ( /False negatives:\s*(\d+)/ ) {
+			$fn = $1;
+		}
+	}
+
+	close (FILE);
+
+	printf "%d %d %d %d\n", $tn, $tp, $fp, $fn;
+}

Added: spamassassin/trunk/masses/generate-corpus
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/generate-corpus	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,24 @@
+# Clean out the old build cache
+rm -rf vm-cache
+
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep ham- | xargs cat > ORIG/ham-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep spam- | xargs cat > ORIG/spam-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep ham-  | xargs cat > ORIG/ham-set1.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep spam-  | xargs cat > ORIG/spam-set1.log
+
+ln -s ham-set1.log ORIG/ham-set0.log
+ln -s spam-set1.log ORIG/spam-set0.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/ham-*.log > ORIG/ham-set1.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/spam-*.log > ORIG/spam-set1.log
+# Clean out the old build cache
+rm -rf vm-cache
+
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep ham- | xargs cat > ORIG/ham-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep spam- | xargs cat > ORIG/spam-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep ham-  | xargs cat > ORIG/ham-set1.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep spam-  | xargs cat > ORIG/spam-set1.log
+
+ln -s ham-set1.log ORIG/ham-set0.log
+ln -s spam-set1.log ORIG/spam-set0.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/ham-*.log > ORIG/ham-set1.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/spam-*.log > ORIG/spam-set1.log

Modified: spamassassin/trunk/masses/logs-to-c
==============================================================================
--- spamassassin/trunk/masses/logs-to-c	(original)
+++ spamassassin/trunk/masses/logs-to-c	Wed Jul 21 21:07:13 2004
@@ -18,9 +18,10 @@
 
 use Getopt::Long;
 use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
-		$opt_spam $opt_ham);
+		$opt_spam $opt_ham $opt_fplog $opt_fnlog);
+
+GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
 
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s", "scoreset=i");
 my $argcffile = $opt_cffile;
 
 my $justcount = 0;
@@ -33,6 +34,10 @@
 $opt_ham ||= 'ham.log';
 $opt_scoreset = 0 if ( !defined $opt_scoreset );
 
+# If desired, report false positives and false negatives for analysis
+if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
+if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }
+
 my $nybias = 10;
 
 # lambda value for TCR equation, representing the cost of of an FP vs. the
@@ -84,6 +89,7 @@
     while (<IN>) {
       next unless /^[^#]/;
       if($_ !~ /^.\s+([-\d]+)\s+(\S+)\s*/) { warn "bad line: $_"; next; }
+      my $msgline = $_;
       my $hits = $1;
       #my $id = $2;
       $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
@@ -118,6 +124,9 @@
             $ga_yy++; $yyscore += $score;
           } else {
             $ga_yn++; $ynscore += $score;
+	    if (defined $opt_fnlog) {
+	    	print FNLOG $msgline;
+	    }
           }
         } else {
           $is_spam{$count} = 1;
@@ -128,6 +137,9 @@
           if ($score >= $threshold) {
 	    #print STDERR "FP: $id\n";
             $ga_ny++; $nyscore += $score;
+	    if (defined $opt_fplog) {
+	    	print FPLOG $msgline;
+	    }
           } else {
             $ga_nn++; $nnscore += $score;
           }

Modified: spamassassin/trunk/masses/mk-baseline-results
==============================================================================
--- spamassassin/trunk/masses/mk-baseline-results	(original)
+++ spamassassin/trunk/masses/mk-baseline-results	Wed Jul 21 21:07:13 2004
@@ -12,7 +12,7 @@
 echo "Classification success on test corpora, at default threshold:"
 echo
 
-./logs-to-c --spam=spam-validate.log --ham=ham-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+./logs-to-c --spam=spam-test.log --ham=ham-test.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
 
 echo
 echo "Results on test corpora at various alternative thresholds:"
@@ -20,7 +20,7 @@
 
 # list a wide range of thresholds, so that we can make graphs later ;)
 for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
-  ./logs-to-c --spam=spam-validate.log --ham=ham-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+  ./logs-to-c --spam=spam-test.log --ham=ham-test.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
   echo
 done
 

Modified: spamassassin/trunk/masses/perceptron.c
==============================================================================
--- spamassassin/trunk/masses/perceptron.c	(original)
+++ spamassassin/trunk/masses/perceptron.c	Wed Jul 21 21:07:13 2004
@@ -44,7 +44,7 @@
 #endif
 
 #define OUTPUT_FILE "perceptron.scores"
-#define SCORE_RANGES 1
+/* #define IGNORE_SCORE_RANGES 1 */
 
 void init_wheel ();
 void destroy_wheel ();
@@ -52,6 +52,7 @@
 void init_weights();
 void destroy_weights ();
 void write_weights (FILE * fp);
+void scale_scores (double old_threshold, double new_threshold);
 double evaluate_test (int test);
 double evaluate_test_nogain (int test);
 void train (int num_epochs, double learning_rate);
@@ -65,7 +66,9 @@
 		   SIZE OF THE ROULETTE_WHEEL ARRAY!). */
 int * roulette_wheel; /* Used for roulette wheel selection. */
 double ham_preference = 2.0;
-double threshold = 5.0;
+
+#define DEFAULT_THRESHOLD 5.0
+double threshold = DEFAULT_THRESHOLD;
 
 double * weights; /* The weights of the single-layer perceptron. */
 double bias; /* The network bias for the single-layer perceptron. */
@@ -227,13 +230,38 @@
 
 	for (i = 0; i < num_scores; i++) {
 		if ( is_mutable[i] )  {
-			fprintf(fp, "score %-30s %2.3f\n", score_names[i], weight_to_score(weights[i]));
+			fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n", score_names[i], weight_to_score(weights[i]), range_lo[i], range_hi[i]);
 		} else {
 			fprintf(fp, "score %-30s %2.3f # not mutable\n", score_names[i], range_lo[i]);
 		}
 	}
 }
 
+/* This is to support Daniel's threshold thing. */
+void scale_scores (double old_threshold, double new_threshold) {
+	int i;
+
+	/* No need to scale something to itself. */
+	if ( old_threshold == new_threshold ) {
+		return;
+	}
+
+	for (i = 0; i < num_scores; i++) {
+		if ( is_mutable[i] ) {
+			range_lo[i] = range_lo[i] * new_threshold / old_threshold;
+			range_hi[i] = range_hi[i] * new_threshold / old_threshold;
+		}
+	}
+
+	/* Maybe we don't want this bit.  This prescaling stuff makes my
+	 * brain hurt.*/
+	/*
+	for (i = 0; i < num_nondup; i++) {
+		scores[i] = scores[i] * new_threshold / old_threshold;
+	}
+	*/
+}
+
 /* Computes the value of the activation function of the perceptron for
  * a given input. */
 double evaluate_test (int test) {
@@ -327,12 +355,14 @@
 #endif
 	
 			/* adjust the weights to descend the steepest part of the error gradient */
-			bias += delta;
+			if ( epoch + 1 < num_epochs ) {
+				bias += delta;
+			}
 			for (i = 0; i < num_tests_hit[random_test]; i++) {
 				int idx = tests_hit[random_test][i];
 				weights[idx] += delta;
 
-#ifdef SCORE_RANGES
+#ifdef IGNORE_SCORE_RANGES
 				/* Constrain the weights so that nice rules are always <= 0 etc. */
 				if ( range_lo[idx] >= 0 && weights[idx] < 0 ) {
 					weights[idx] = 0;
@@ -411,6 +441,11 @@
 	/* Load the instances and score constraints generated by logs-to-c. */
 	loadtests();
 	loadscores();
+
+	/* If the threshold has been changed, the ranges and scores need to be
+	 * scaled so that the output of the program will not be affected.
+	 */
+	scale_scores (DEFAULT_THRESHOLD, threshold);
 
 	/* Replicate instances from the training set to bias against false positives. */
 	init_wheel ();

Modified: spamassassin/trunk/masses/runGA
==============================================================================
--- spamassassin/trunk/masses/runGA	(original)
+++ spamassassin/trunk/masses/runGA	Wed Jul 21 21:07:13 2004
@@ -4,9 +4,11 @@
 . config
 
 NAME="set$SCORESET"
+LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
 
-# beware!
-svn revert ../rules/50_scores.cf
+if [ "$NOTE" != "" ]; then
+	LOGDIR="$LOGDIR-$NOTE"
+fi
 
 if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
 	echo "Couldn't find logs for $NAME" >&2
@@ -14,73 +16,77 @@
 fi
 
 if [ "x$1" = "x" ]; then
+# This should be in here instead.  Prevents testing.
+svn revert ../rules/50_scores.cf
+
 echo "[Doing a scoreset $SCORESET score-generation run]"
 
 # Clean out old runs
 echo "[Cleaning up]"
-rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+rm -rf spam-test.log ham-test.log spam.log ham.log \
 	NSBASE SPBASE tmp make.output freqs perceptron.scores \
-	gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
+	$LOGDIR
 make clean >/dev/null
 
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
 # Generate 90/10 split logs
 echo "[Generating 90/10 split ham]"
 mkdir NSBASE SPBASE
 cd NSBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+../tenpass/split-log-into-buckets-random 10 < ../ORIG/ham-$NAME.log > /dev/null
 cat split-[1-9].log > ham.log
 rm -f split-[1-9].log
-mv split-10.log ham-validate.log
+mv split-10.log ham-test.log
 
 echo "[Generating 90/10 split spam]"
 cd ../SPBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+../tenpass/split-log-into-buckets-random 10 < ../ORIG/spam-$NAME.log > /dev/null
 cat split-[1-9].log > spam.log
 rm -f split-[1-9].log
-mv split-10.log spam-validate.log
+mv split-10.log spam-test.log
 cd ..
 
 echo "[Setting up for gen run]"
 # Ok, setup for a run
 ln -s SPBASE/spam.log .
 ln -s NSBASE/ham.log .
-ln -s SPBASE/spam-validate.log .
-ln -s NSBASE/ham-validate.log .
+ln -s SPBASE/spam-test.log .
+ln -s NSBASE/ham-test.log .
 
 # try to find number of processors
 numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
 
 echo "[Generating perceptron]"
 # Generate perceptron with full logs
-make -j $numcpus SCORESET=$SCORESET > make.output 2>&1
+make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
 
-for threshold in 5.0 4.9 4.8 4.7 4.6 4.5 4.4 4.3 4.2 4.1 4.0; do
-	(
-	echo "[gen run start]"
-	pwd
-	date
-	./perceptron -t $threshold -p 2.0 -e 100
-	mv perceptron.scores gen-$NAME-$threshold.scores
-	echo "[gen run end]"
-	pwd
-	date
-	) | tee gen-$NAME-$threshold.out
-	svn revert ../rules/50_scores.cf
-	./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf gen-$NAME-$threshold.scores > /tmp/runGA.$$
-	mv /tmp/runGA.$$ ../rules/50_scores.cf
-	./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET > gen-$NAME-$threshold.statistics
-done
+(
+echo "[config]"
+cat config
+echo "[gen run start]"
+pwd
+date
+./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+mv perceptron.scores $LOGDIR/scores
+echo "[gen run end]"
+) | tee $LOGDIR/log
+svn revert ../rules/50_scores.cf
+./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores > /tmp/runGA.$$
+mv /tmp/runGA.$$ ../rules/50_scores.cf
+./fp-fn-statistics --ham ham-test.log --spam spam-test.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives --fplog $LOGDIR/false_positives > $LOGDIR/test
 
 else
 
 # This needs to have 50_scores.cf in place first ...
-echo "[gen validation results]"
-./logs-to-c --spam=SPBASE/spam-validate.log \
-	--ham=NSBASE/ham-validate.log \
-	--count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
+echo "[gen test results]"
+./logs-to-c --spam=SPBASE/spam-test.log \
+	--ham=NSBASE/ham-test.log \
+	--count --cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
 
 echo "[STATISTICS file generation]"
-./mk-baseline-results $SCORESET | tee gen-$NAME.statistics
+./mk-baseline-results $SCORESET | tee $LOGDIR/statistics
 fi
 
 exit 0

Added: spamassassin/trunk/masses/tenpass/split-log-into-buckets-random
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/tenpass/split-log-into-buckets-random	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+#
+# split-log-into-buckets-random [n]
+#
+# Split a mass-check log into n roughly equal-sized buckets, assigning
+# each input line to a bucket chosen uniformly at random (unlike the
+# sequential split-log-into-buckets script, which cycles through the
+# buckets in order).  n defaults to 10
+
+my $numbuckets = 0;
+if (defined $ARGV[0]) {
+  $numbuckets = $ARGV[0]+0;
+}
+$numbuckets ||= 10;
+
+my %buckets = ();
+foreach my $i (1 .. $numbuckets) {
+  print "Creating split-$i.log\n";
+  open ($buckets{$i}, ">split-$i.log");
+}
+
+while (<STDIN>) {
+  select $buckets{1+int(rand()*$numbuckets)}; print $_;
+}
+
+foreach my $i (1 .. $numbuckets) {
+  close $buckets{$i};
+}
+
+#!/usr/bin/perl
+#
+# split-log-into-buckets-random [n]
+#
+# Split a mass-check log into n roughly equal-sized buckets, assigning
+# each input line to a bucket chosen uniformly at random (unlike the
+# sequential split-log-into-buckets script, which cycles through the
+# buckets in order).  n defaults to 10
+
+my $numbuckets = 0;
+if (defined $ARGV[0]) {
+  $numbuckets = $ARGV[0]+0;
+}
+$numbuckets ||= 10;
+
+my %buckets = ();
+foreach my $i (1 .. $numbuckets) {
+  print "Creating split-$i.log\n";
+  open ($buckets{$i}, ">split-$i.log");
+}
+
+while (<STDIN>) {
+  select $buckets{1+int(rand()*$numbuckets)}; print $_;
+}
+
+foreach my $i (1 .. $numbuckets) {
+  close $buckets{$i};
+}
+

Added: spamassassin/trunk/masses/validate-model
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/validate-model	Wed Jul 21 21:07:13 2004
@@ -0,0 +1,252 @@
+#!/bin/sh
+
+# set SCORESET
+. config
+
+RUNS=10
+PASSES=`seq 1 ${RUNS}`
+
+NAME="set$SCORESET"
+LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
+CACHEDIR="vm-cache/$NAME"
+
+if [ "$NOTE" != "" ]; then
+	LOGDIR="$LOGDIR-$NOTE"
+fi
+
+if [ ! -d $CACHEDIR ]; then
+	mkdir -p $CACHEDIR
+fi
+
+if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
+	echo "Couldn't find logs for $NAME" >&2
+	exit 1
+fi
+
+
+echo "[Doing a scoreset $SCORESET score-generation run]"
+
+# clear out the old logs
+rm -rf $LOGDIR
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
+(
+echo "[config]"
+cat config
+) | tee -a $LOGDIR/log
+
+for PASS in $PASSES; do
+	# Clean out old runs
+	echo "[Cleaning up for pass $PASS]"
+	rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+		NSBASE SPBASE tmp freqs perceptron.scores
+	make clean >/dev/null
+
+	# revert to the previous scoring
+	svn revert ../rules/50_scores.cf
+
+	if [ ! -d $CACHEDIR/$PASS ]; then
+		# Generate 90/10 split logs
+		echo "[Generating 90/10 split ham]"
+		mkdir NSBASE SPBASE
+		cd NSBASE
+		../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+		for p in $PASSES; do
+			if [ "$p" != "$PASS" ]; then
+				cat split-$p.log >> ham.log
+			else
+				mv split-$p.log ham-validate.log
+			fi
+		done
+		rm -f split-*.log
+
+		echo "[Generating 90/10 split spam]"
+		cd ../SPBASE
+		../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+		for p in $PASSES; do
+			if [ "$p" != "$PASS" ]; then
+				cat split-$p.log >> spam.log
+			else
+				mv split-$p.log spam-validate.log
+			fi
+		done
+		rm -f split-*.log
+		cd ..
+
+		echo "[Setting up for pass $PASS]"
+		# Ok, setup for a run
+		ln -s SPBASE/spam.log .
+		ln -s NSBASE/ham.log .
+		ln -s SPBASE/spam-validate.log .
+		ln -s NSBASE/ham-validate.log .
+
+		# try to find number of processors
+		numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
+	else
+		echo "[Retrieving from $CACHEDIR/$PASS]"
+		ln -s $CACHEDIR/$PASS/SPBASE .
+		ln -s $CACHEDIR/$PASS/NSBASE .
+		ln -s $CACHEDIR/$PASS/tmp .
+		ln -s $CACHEDIR/$PASS/freqs .
+
+		ln -s SPBASE/spam.log .
+		ln -s NSBASE/ham.log .
+		ln -s SPBASE/spam-validate.log .
+		ln -s NSBASE/ham-validate.log .
+	fi
+	
+	echo "[Generating perceptron]"
+	# Generate perceptron with full logs
+	make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
+
+	(
+	echo "[pass $PASS start]"
+	pwd
+	date
+	./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+	mv perceptron.scores $LOGDIR/scores.$PASS
+	echo "[pass $PASS end]"
+	) | tee -a $LOGDIR/log
+	./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores.$PASS > /tmp/runGA.$$
+	mv /tmp/runGA.$$ ../rules/50_scores.cf
+	echo "[evaluating performance]" | tee -a $LOGDIR/log
+	./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives.$PASS --fplog $LOGDIR/false_positives.$PASS > $LOGDIR/validate.$PASS
+
+	if [ ! -d $CACHEDIR/$PASS ]; then
+		echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
+		mkdir -p $CACHEDIR/$PASS
+		mv tmp freqs SPBASE NSBASE $CACHEDIR/$PASS
+	fi
+
+done
+
+./extract-results $LOGDIR/validate.* > $LOGDIR/validate
+
+exit 0
+#!/bin/sh
+
+# set SCORESET
+. config
+
+RUNS=10
+PASSES=`seq 1 ${RUNS}`
+
+NAME="set$SCORESET"
+LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
+CACHEDIR="vm-cache/$NAME"
+
+if [ "$NOTE" != "" ]; then
+	LOGDIR="$LOGDIR-$NOTE"
+fi
+
+if [ ! -d $CACHEDIR ]; then
+	mkdir -p $CACHEDIR
+fi
+
+if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
+	echo "Couldn't find logs for $NAME" >&2
+	exit 1
+fi
+
+
+echo "[Doing a scoreset $SCORESET score-generation run]"
+
+# clear out the old logs
+rm -rf $LOGDIR
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
+(
+echo "[config]"
+cat config
+) | tee -a $LOGDIR/log
+
+for PASS in $PASSES; do
+	# Clean out old runs
+	echo "[Cleaning up for pass $PASS]"
+	rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+		NSBASE SPBASE tmp freqs perceptron.scores
+	make clean >/dev/null
+
+	# revert to the previous scoring
+	svn revert ../rules/50_scores.cf
+
+	if [ ! -d $CACHEDIR/$PASS ]; then
+		# Generate 90/10 split logs
+		echo "[Generating 90/10 split ham]"
+		mkdir NSBASE SPBASE
+		cd NSBASE
+		../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+		for p in $PASSES; do
+			if [ "$p" != "$PASS" ]; then
+				cat split-$p.log >> ham.log
+			else
+				mv split-$p.log ham-validate.log
+			fi
+		done
+		rm -f split-*.log
+
+		echo "[Generating 90/10 split spam]"
+		cd ../SPBASE
+		../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+		for p in $PASSES; do
+			if [ "$p" != "$PASS" ]; then
+				cat split-$p.log >> spam.log
+			else
+				mv split-$p.log spam-validate.log
+			fi
+		done
+		rm -f split-*.log
+		cd ..
+
+		echo "[Setting up for pass $PASS]"
+		# Ok, setup for a run
+		ln -s SPBASE/spam.log .
+		ln -s NSBASE/ham.log .
+		ln -s SPBASE/spam-validate.log .
+		ln -s NSBASE/ham-validate.log .
+
+		# try to find number of processors
+		numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
+	else
+		echo "[Retrieving from $CACHEDIR/$PASS]"
+		ln -s $CACHEDIR/$PASS/SPBASE .
+		ln -s $CACHEDIR/$PASS/NSBASE .
+		ln -s $CACHEDIR/$PASS/tmp .
+		ln -s $CACHEDIR/$PASS/freqs .
+
+		ln -s SPBASE/spam.log .
+		ln -s NSBASE/ham.log .
+		ln -s SPBASE/spam-validate.log .
+		ln -s NSBASE/ham-validate.log .
+	fi
+	
+	echo "[Generating perceptron]"
+	# Generate perceptron with full logs
+	make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
+
+	(
+	echo "[pass $PASS start]"
+	pwd
+	date
+	./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+	mv perceptron.scores $LOGDIR/scores.$PASS
+	echo "[pass $PASS end]"
+	) | tee -a $LOGDIR/log
+	./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores.$PASS > /tmp/runGA.$$
+	mv /tmp/runGA.$$ ../rules/50_scores.cf
+	echo "[evaluating performance]" | tee -a $LOGDIR/log
+	./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives.$PASS --fplog $LOGDIR/false_positives.$PASS > $LOGDIR/validate.$PASS
+
+	if [ ! -d $CACHEDIR/$PASS ]; then
+		echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
+		mkdir -p $CACHEDIR/$PASS
+		mv tmp freqs SPBASE NSBASE $CACHEDIR/$PASS
+	fi
+
+done
+
+./extract-results $LOGDIR/validate.* > $LOGDIR/validate
+
+exit 0