You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/07/22 06:07:14 UTC
svn commit: rev 23133 - in spamassassin/trunk/masses: . tenpass
Author: jm
Date: Wed Jul 21 21:07:13 2004
New Revision: 23133
Added:
spamassassin/trunk/masses/compare-models
spamassassin/trunk/masses/config.set0
spamassassin/trunk/masses/config.set1
spamassassin/trunk/masses/extract-results
spamassassin/trunk/masses/generate-corpus
spamassassin/trunk/masses/tenpass/split-log-into-buckets-random (contents, props changed)
spamassassin/trunk/masses/validate-model
Modified:
spamassassin/trunk/masses/config
spamassassin/trunk/masses/logs-to-c
spamassassin/trunk/masses/mk-baseline-results
spamassassin/trunk/masses/perceptron.c
spamassassin/trunk/masses/runGA
Log:
bug 3584: improvements to score learning system; lots, too many to list here. Henry's patch
Added: spamassassin/trunk/masses/compare-models
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/compare-models Wed Jul 21 21:07:13 2004
@@ -0,0 +1,322 @@
+#!/usr/bin/perl
+
+# This script is used to do a statistical comparative analysis of two
+# cross-validation result sets (as produced by validate-model), using a
+# paired-sample t-test on the FP rate, FN rate, and TCR of each pass.
+
+use Statistics::Distributions;
+use strict;
+
+my $alpha = 0.95; # acceptable confidence in not making a type 1 error
+ # i.e. assume that the means are different
+my $lambda = 50; # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 2 ) {
+ print STDERR "Usage: compare-models [validate1] [validate2]\n";
+ exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);
+
+open (FILE, $ARGV[0]) || die $!;
+while (<FILE>) {
+ my @x = split(/\s+/);
+ push (@fp1, $x[2] / ($x[0] + $x[2]));
+ push (@fn1, $x[3] / ($x[1] + $x[3]));
+ push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+my (@fp2, @fn2, @tcr2);
+
+open (FILE, $ARGV[1]) || die $!;
+while (<FILE>) {
+ my @x = split(/\s+/);
+ push (@fp2, $x[2] / ($x[0] + $x[2]));
+ push (@fn2, $x[3] / ($x[1] + $x[3]));
+ push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+stat_analysis ("False positives", "pct", \@fp1, \@fp2);
+stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
+
+sub stat_analysis {
+ my $title = shift;
+ my $pct = shift;
+ my $s1 = shift;
+ my $s2 = shift;
+
+ unless ( scalar(@$s1) == scalar(@$s1) ) {
+ print STDERR "Can't compute stats for $title. Samples are not paired.\n";
+ return;
+ }
+
+ # This is the number of degrees of freedom of the two sample sets (i.e.
+ # the number of samples in each set).
+ my $dof = scalar(@{$s1});
+
+ print "$title:\n";
+
+ # Compute the mean and standard deviation of the first sample
+ # mean = 1/n * sum(s[i])
+ my $mean_s1 = 0;
+ foreach my $i (1..$dof) {
+ $mean_s1 += $$s1[$i];
+ }
+ $mean_s1 /= $dof;
+
+ # var = 1/(n-1) * sum((mean - s[i])^2)
+ my $var_s1 = 0;
+ foreach my $i (1..$dof) {
+ $var_s1 += ($mean_s1 - $$s1[$i])**2;
+ }
+ $var_s1 /= $dof - 1;
+
+ # std = sqrt(var)
+ my $std_s1 = sqrt($var_s1);
+
+ # Compute the mean and standard deviation of the second sample
+ # mean = 1/n * sum(s[i])
+ my $mean_s2 = 0;
+ foreach my $i (1..$dof) {
+ $mean_s2 += $$s2[$i];
+ }
+ $mean_s2 /= $dof;
+
+ # var = 1/(n-1) * sum((mean - s[i])^2)
+ my $var_s2 = 0;
+ foreach my $i (1..$dof) {
+ $var_s2 += ($mean_s2 - $$s2[$i])**2;
+ }
+ $var_s2 /= $dof - 1;
+
+ # std = sqrt(var)
+ my $std_s2 = sqrt($var_s2);
+
+ # SA developers like percentage points instead of probabilities.
+ if ( $pct eq "pct" ) {
+ printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
+ printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
+ } else {
+ printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
+ printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
+ }
+
+ # Compute the mean of the differences between the two samples
+ my $mean_d = 0;
+ foreach my $i (1..$dof) {
+ $mean_d += $$s1[$i] - $$s2[$i];
+ }
+ $mean_d /= $dof;
+
+ # Compute the variance of the differences between the two samples
+ my $var_d = 0;
+ foreach my $i (1..$dof) {
+ $var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
+ }
+ $var_d /= $dof - 1;
+ my $std_d = sqrt($var_d);
+
+ # To determine whether two samples are from the same distribution
+ # (i.e. they have the same mean), we are going to use a paired sample
+ # t-test. You can find more information about this in Tom Mitchell's
+ # "Machine Learning" book.
+
+  # Let t = mean_d / sqrt(var_d / n), i.e. the mean difference divided
+  # by its standard error.
+ my $tstat;
+ if ( $var_d > 0 ) {
+ $tstat = $mean_d / sqrt($var_d / $dof);
+ } else {
+ $tstat = 0;
+ }
+
+ # Now we find the critical value of t for the alpha% confidence
+ # interval.
+ my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
+
+ # This is the probability that the two distributions are from different
+ # means.
+ my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
+
+  # If the t statistic is less than the critical value, we accept the
+  # null hypothesis (the distributions have the same mean); otherwise
+  # we reject it.
+ if ( abs($tstat) < $tcrit ) {
+ printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
+ } else {
+ printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
+ }
+
+ # This displays an estimate of the confidence interval around the
+ # estimated mean difference between the two samples. Bear in mind that
+ # the t statistic is working on the local differences and not the
+ # global difference.
+ if ( $pct eq "pct" ) {
+ printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
+ } else {
+ printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
+ }
+
+ print "\n";
+}
+#!/usr/bin/perl
+
+# This script is used to do a statistical comparative analysis of two
+# cross-validation result sets (as produced by validate-model), using a
+# paired-sample t-test on the FP rate, FN rate, and TCR of each pass.
+
+use Statistics::Distributions;
+use strict;
+
+my $alpha = 0.95; # acceptable confidence in not making a type 1 error
+ # i.e. assume that the means are different
+my $lambda = 50; # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 2 ) {
+ print STDERR "Usage: compare-models [validate1] [validate2]\n";
+ exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);
+
+open (FILE, $ARGV[0]) || die $!;
+while (<FILE>) {
+ my @x = split(/\s+/);
+ push (@fp1, $x[2] / ($x[0] + $x[2]));
+ push (@fn1, $x[3] / ($x[1] + $x[3]));
+ push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+my (@fp2, @fn2, @tcr2);
+
+open (FILE, $ARGV[1]) || die $!;
+while (<FILE>) {
+ my @x = split(/\s+/);
+ push (@fp2, $x[2] / ($x[0] + $x[2]));
+ push (@fn2, $x[3] / ($x[1] + $x[3]));
+ push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close (FILE);
+
+stat_analysis ("False positives", "pct", \@fp1, \@fp2);
+stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
+
+sub stat_analysis {
+ my $title = shift;
+ my $pct = shift;
+ my $s1 = shift;
+ my $s2 = shift;
+
+ unless ( scalar(@$s1) == scalar(@$s1) ) {
+ print STDERR "Can't compute stats for $title. Samples are not paired.\n";
+ return;
+ }
+
+ # This is the number of degrees of freedom of the two sample sets (i.e.
+ # the number of samples in each set).
+ my $dof = scalar(@{$s1});
+
+ print "$title:\n";
+
+ # Compute the mean and standard deviation of the first sample
+ # mean = 1/n * sum(s[i])
+ my $mean_s1 = 0;
+ foreach my $i (1..$dof) {
+ $mean_s1 += $$s1[$i];
+ }
+ $mean_s1 /= $dof;
+
+ # var = 1/(n-1) * sum((mean - s[i])^2)
+ my $var_s1 = 0;
+ foreach my $i (1..$dof) {
+ $var_s1 += ($mean_s1 - $$s1[$i])**2;
+ }
+ $var_s1 /= $dof - 1;
+
+ # std = sqrt(var)
+ my $std_s1 = sqrt($var_s1);
+
+ # Compute the mean and standard deviation of the second sample
+ # mean = 1/n * sum(s[i])
+ my $mean_s2 = 0;
+ foreach my $i (1..$dof) {
+ $mean_s2 += $$s2[$i];
+ }
+ $mean_s2 /= $dof;
+
+ # var = 1/(n-1) * sum((mean - s[i])^2)
+ my $var_s2 = 0;
+ foreach my $i (1..$dof) {
+ $var_s2 += ($mean_s2 - $$s2[$i])**2;
+ }
+ $var_s2 /= $dof - 1;
+
+ # std = sqrt(var)
+ my $std_s2 = sqrt($var_s2);
+
+ # SA developers like percentage points instead of probabilities.
+ if ( $pct eq "pct" ) {
+ printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
+ printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
+ } else {
+ printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
+ printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
+ }
+
+ # Compute the mean of the differences between the two samples
+ my $mean_d = 0;
+ foreach my $i (1..$dof) {
+ $mean_d += $$s1[$i] - $$s2[$i];
+ }
+ $mean_d /= $dof;
+
+ # Compute the variance of the differences between the two samples
+ my $var_d = 0;
+ foreach my $i (1..$dof) {
+ $var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
+ }
+ $var_d /= $dof - 1;
+ my $std_d = sqrt($var_d);
+
+ # To determine whether two samples are from the same distribution
+ # (i.e. they have the same mean), we are going to use a paired sample
+ # t-test. You can find more information about this in Tom Mitchell's
+ # "Machine Learning" book.
+
+  # Let t = mean_d / sqrt(var_d / n), i.e. the mean difference divided
+  # by its standard error.
+ my $tstat;
+ if ( $var_d > 0 ) {
+ $tstat = $mean_d / sqrt($var_d / $dof);
+ } else {
+ $tstat = 0;
+ }
+
+ # Now we find the critical value of t for the alpha% confidence
+ # interval.
+ my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
+
+ # This is the probability that the two distributions are from different
+ # means.
+ my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
+
+  # If the t statistic is less than the critical value, we accept the
+  # null hypothesis (the distributions have the same mean); otherwise
+  # we reject it.
+ if ( abs($tstat) < $tcrit ) {
+ printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
+ } else {
+ printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
+ }
+
+ # This displays an estimate of the confidence interval around the
+ # estimated mean difference between the two samples. Bear in mind that
+ # the t statistic is working on the local differences and not the
+ # global difference.
+ if ( $pct eq "pct" ) {
+ printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
+ } else {
+ printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
+ }
+
+ print "\n";
+}
Modified: spamassassin/trunk/masses/config
==============================================================================
--- spamassassin/trunk/masses/config (original)
+++ spamassassin/trunk/masses/config Wed Jul 21 21:07:13 2004
@@ -1 +1,5 @@
SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
Added: spamassassin/trunk/masses/config.set0
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/config.set0 Wed Jul 21 21:07:13 2004
@@ -0,0 +1,15 @@
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
+SCORESET=0
+HAM_PREFERENCE=2.0
+THRESHOLD=4.25
+EPOCHS=100
+NOTE=
Added: spamassassin/trunk/masses/config.set1
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/config.set1 Wed Jul 21 21:07:13 2004
@@ -0,0 +1,15 @@
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=
+SCORESET=1
+HAM_PREFERENCE=2.0
+THRESHOLD=4.7
+EPOCHS=100
+NOTE=
Added: spamassassin/trunk/masses/extract-results
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/extract-results Wed Jul 21 21:07:13 2004
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
+# of the logs-to-c program.
+#
+# This is used by the validate-model script to aggregate the results of a
+# cross validation for analysis with the compare-models script.
+
+use strict;
+
+foreach my $file (@ARGV) {
+ open (FILE, "<$file") || die $!;
+
+ my ($tp, $tn, $fp, $fn);
+
+ while (<FILE>) {
+ if ( /Correctly non-spam:\s*(\d+)/ ) {
+ $tn = $1;
+ } elsif ( /Correctly spam:\s*(\d+)/ ) {
+ $tp = $1;
+ } elsif ( /False positives:\s*(\d+)/ ) {
+ $fp = $1;
+ } elsif ( /False negatives:\s*(\d+)/ ) {
+ $fn = $1;
+ }
+ }
+
+ close (FILE);
+
+ printf "%d %d %d %d\n", $tn, $tp, $fp, $fn;
+}
+#!/usr/bin/perl
+
+# This script extracts the confusion matrix (tp, tn, fp, fn) from the output
+# of the logs-to-c program.
+#
+# This is used by the validate-model script to aggregate the results of a
+# cross validation for analysis with the compare-models script.
+
+use strict;
+
+foreach my $file (@ARGV) {
+ open (FILE, "<$file") || die $!;
+
+ my ($tp, $tn, $fp, $fn);
+
+ while (<FILE>) {
+ if ( /Correctly non-spam:\s*(\d+)/ ) {
+ $tn = $1;
+ } elsif ( /Correctly spam:\s*(\d+)/ ) {
+ $tp = $1;
+ } elsif ( /False positives:\s*(\d+)/ ) {
+ $fp = $1;
+ } elsif ( /False negatives:\s*(\d+)/ ) {
+ $fn = $1;
+ }
+ }
+
+ close (FILE);
+
+ printf "%d %d %d %d\n", $tn, $tp, $fp, $fn;
+}
Added: spamassassin/trunk/masses/generate-corpus
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/generate-corpus Wed Jul 21 21:07:13 2004
@@ -0,0 +1,24 @@
+# Clean out the old build cache
+rm -rf vm-cache
+
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep ham- | xargs cat > ORIG/ham-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep spam- | xargs cat > ORIG/spam-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep ham- | xargs cat > ORIG/ham-set1.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep spam- | xargs cat > ORIG/spam-set1.log
+
+ln -s ham-set1.log ORIG/ham-set0.log
+ln -s spam-set1.log ORIG/spam-set0.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/ham-*.log > ORIG/ham-set1.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/spam-*.log > ORIG/spam-set1.log
+# Clean out the old build cache
+rm -rf vm-cache
+
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep ham- | xargs cat > ORIG/ham-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -v -- -net | grep spam- | xargs cat > ORIG/spam-set0.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep ham- | xargs cat > ORIG/ham-set1.log
+#find ~/spamassassin/corpus -mtime -7 -name \*.log |egrep -- -net | grep spam- | xargs cat > ORIG/spam-set1.log
+
+ln -s ham-set1.log ORIG/ham-set0.log
+ln -s spam-set1.log ORIG/spam-set0.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/ham-*.log > ORIG/ham-set1.log
+cat ~/spamassassin/corpus/submit-3.0.0-sets01/spam-*.log > ORIG/spam-set1.log
Modified: spamassassin/trunk/masses/logs-to-c
==============================================================================
--- spamassassin/trunk/masses/logs-to-c (original)
+++ spamassassin/trunk/masses/logs-to-c Wed Jul 21 21:07:13 2004
@@ -18,9 +18,10 @@
use Getopt::Long;
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
- $opt_spam $opt_ham);
+ $opt_spam $opt_ham $opt_fplog $opt_fnlog);
+
+GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s", "scoreset=i", "fplog=s", "fnlog=s");
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s", "scoreset=i");
my $argcffile = $opt_cffile;
my $justcount = 0;
@@ -33,6 +34,10 @@
$opt_ham ||= 'ham.log';
$opt_scoreset = 0 if ( !defined $opt_scoreset );
+# If desired, report false positives and false negatives for analysis
+if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
+if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }
+
my $nybias = 10;
# lambda value for TCR equation, representing the cost of an FP vs. the
@@ -84,6 +89,7 @@
while (<IN>) {
next unless /^[^#]/;
if($_ !~ /^.\s+([-\d]+)\s+(\S+)\s*/) { warn "bad line: $_"; next; }
+ my $msgline = $_;
my $hits = $1;
#my $id = $2;
$_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
@@ -118,6 +124,9 @@
$ga_yy++; $yyscore += $score;
} else {
$ga_yn++; $ynscore += $score;
+ if (defined $opt_fnlog) {
+ print FNLOG $msgline;
+ }
}
} else {
$is_spam{$count} = 1;
@@ -128,6 +137,9 @@
if ($score >= $threshold) {
#print STDERR "FP: $id\n";
$ga_ny++; $nyscore += $score;
+ if (defined $opt_fplog) {
+ print FPLOG $msgline;
+ }
} else {
$ga_nn++; $nnscore += $score;
}
Modified: spamassassin/trunk/masses/mk-baseline-results
==============================================================================
--- spamassassin/trunk/masses/mk-baseline-results (original)
+++ spamassassin/trunk/masses/mk-baseline-results Wed Jul 21 21:07:13 2004
@@ -12,7 +12,7 @@
echo "Classification success on test corpora, at default threshold:"
echo
-./logs-to-c --spam=spam-validate.log --ham=ham-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+./logs-to-c --spam=spam-test.log --ham=ham-test.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
echo
echo "Results on test corpora at various alternative thresholds:"
@@ -20,7 +20,7 @@
# list a wide range of thresholds, so that we can make graphs later ;)
for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
- ./logs-to-c --spam=spam-validate.log --ham=ham-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+ ./logs-to-c --spam=spam-test.log --ham=ham-test.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
echo
done
Modified: spamassassin/trunk/masses/perceptron.c
==============================================================================
--- spamassassin/trunk/masses/perceptron.c (original)
+++ spamassassin/trunk/masses/perceptron.c Wed Jul 21 21:07:13 2004
@@ -44,7 +44,7 @@
#endif
#define OUTPUT_FILE "perceptron.scores"
-#define SCORE_RANGES 1
+/* #define IGNORE_SCORE_RANGES 1 */
void init_wheel ();
void destroy_wheel ();
@@ -52,6 +52,7 @@
void init_weights();
void destroy_weights ();
void write_weights (FILE * fp);
+void scale_scores (double old_threshold, double new_threshold);
double evaluate_test (int test);
double evaluate_test_nogain (int test);
void train (int num_epochs, double learning_rate);
@@ -65,7 +66,9 @@
SIZE OF THE ROULETTE_WHEEL ARRAY!). */
int * roulette_wheel; /* Used for roulette wheel selection. */
double ham_preference = 2.0;
-double threshold = 5.0;
+
+#define DEFAULT_THRESHOLD 5.0
+double threshold = DEFAULT_THRESHOLD;
double * weights; /* The weights of the single-layer perceptron. */
double bias; /* The network bias for the single-layer perceptron. */
@@ -227,13 +230,38 @@
for (i = 0; i < num_scores; i++) {
if ( is_mutable[i] ) {
- fprintf(fp, "score %-30s %2.3f\n", score_names[i], weight_to_score(weights[i]));
+ fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n", score_names[i], weight_to_score(weights[i]), range_lo[i], range_hi[i]);
} else {
fprintf(fp, "score %-30s %2.3f # not mutable\n", score_names[i], range_lo[i]);
}
}
}
+/* This is to support Daniel's threshold thing. */
+void scale_scores (double old_threshold, double new_threshold) {
+ int i;
+
+ /* No need to scale something to itself. */
+ if ( old_threshold == new_threshold ) {
+ return;
+ }
+
+ for (i = 0; i < num_scores; i++) {
+ if ( is_mutable[i] ) {
+ range_lo[i] = range_lo[i] * new_threshold / old_threshold;
+ range_hi[i] = range_hi[i] * new_threshold / old_threshold;
+ }
+ }
+
+ /* Maybe we don't want this bit. This prescaling stuff makes my
+ * brain hurt.*/
+ /*
+ for (i = 0; i < num_nondup; i++) {
+ scores[i] = scores[i] * new_threshold / old_threshold;
+ }
+ */
+}
+
/* Computes the value of the activation function of the perceptron for
* a given input. */
double evaluate_test (int test) {
@@ -327,12 +355,14 @@
#endif
/* adjust the weights to descend the steepest part of the error gradient */
- bias += delta;
+ if ( epoch + 1 < num_epochs ) {
+ bias += delta;
+ }
for (i = 0; i < num_tests_hit[random_test]; i++) {
int idx = tests_hit[random_test][i];
weights[idx] += delta;
-#ifdef SCORE_RANGES
+#ifdef IGNORE_SCORE_RANGES
/* Constrain the weights so that nice rules are always <= 0 etc. */
if ( range_lo[idx] >= 0 && weights[idx] < 0 ) {
weights[idx] = 0;
@@ -411,6 +441,11 @@
/* Load the instances and score constraints generated by logs-to-c. */
loadtests();
loadscores();
+
+ /* If the threshold has been changed, the ranges and scores need to be
+ * scaled so that the output of the program will not be affected.
+ */
+ scale_scores (DEFAULT_THRESHOLD, threshold);
/* Replicate instances from the training set to bias against false positives. */
init_wheel ();
Modified: spamassassin/trunk/masses/runGA
==============================================================================
--- spamassassin/trunk/masses/runGA (original)
+++ spamassassin/trunk/masses/runGA Wed Jul 21 21:07:13 2004
@@ -4,9 +4,11 @@
. config
NAME="set$SCORESET"
+LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
-# beware!
-svn revert ../rules/50_scores.cf
+if [ "$NOTE" != "" ]; then
+ LOGDIR="$LOGDIR-$NOTE"
+fi
if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
echo "Couldn't find logs for $NAME" >&2
@@ -14,73 +16,77 @@
fi
if [ "x$1" = "x" ]; then
+# This should be in here instead. Prevents testing.
+svn revert ../rules/50_scores.cf
+
echo "[Doing a scoreset $SCORESET score-generation run]"
# Clean out old runs
echo "[Cleaning up]"
-rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+rm -rf spam-test.log ham-test.log spam.log ham.log \
NSBASE SPBASE tmp make.output freqs perceptron.scores \
- gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
+ $LOGDIR
make clean >/dev/null
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
# Generate 90/10 split logs
echo "[Generating 90/10 split ham]"
mkdir NSBASE SPBASE
cd NSBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+../tenpass/split-log-into-buckets-random 10 < ../ORIG/ham-$NAME.log > /dev/null
cat split-[1-9].log > ham.log
rm -f split-[1-9].log
-mv split-10.log ham-validate.log
+mv split-10.log ham-test.log
echo "[Generating 90/10 split spam]"
cd ../SPBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+../tenpass/split-log-into-buckets-random 10 < ../ORIG/spam-$NAME.log > /dev/null
cat split-[1-9].log > spam.log
rm -f split-[1-9].log
-mv split-10.log spam-validate.log
+mv split-10.log spam-test.log
cd ..
echo "[Setting up for gen run]"
# Ok, setup for a run
ln -s SPBASE/spam.log .
ln -s NSBASE/ham.log .
-ln -s SPBASE/spam-validate.log .
-ln -s NSBASE/ham-validate.log .
+ln -s SPBASE/spam-test.log .
+ln -s NSBASE/ham-test.log .
# try to find number of processors
numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
echo "[Generating perceptron]"
# Generate perceptron with full logs
-make -j $numcpus SCORESET=$SCORESET > make.output 2>&1
+make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
-for threshold in 5.0 4.9 4.8 4.7 4.6 4.5 4.4 4.3 4.2 4.1 4.0; do
- (
- echo "[gen run start]"
- pwd
- date
- ./perceptron -t $threshold -p 2.0 -e 100
- mv perceptron.scores gen-$NAME-$threshold.scores
- echo "[gen run end]"
- pwd
- date
- ) | tee gen-$NAME-$threshold.out
- svn revert ../rules/50_scores.cf
- ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf gen-$NAME-$threshold.scores > /tmp/runGA.$$
- mv /tmp/runGA.$$ ../rules/50_scores.cf
- ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET > gen-$NAME-$threshold.statistics
-done
+(
+echo "[config]"
+cat config
+echo "[gen run start]"
+pwd
+date
+./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+mv perceptron.scores $LOGDIR/scores
+echo "[gen run end]"
+) | tee $LOGDIR/log
+svn revert ../rules/50_scores.cf
+./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores > /tmp/runGA.$$
+mv /tmp/runGA.$$ ../rules/50_scores.cf
+./fp-fn-statistics --ham ham-test.log --spam spam-test.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives --fplog $LOGDIR/false_positives > $LOGDIR/test
else
# This needs to have 50_scores.cf in place first ...
-echo "[gen validation results]"
-./logs-to-c --spam=SPBASE/spam-validate.log \
- --ham=NSBASE/ham-validate.log \
- --count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
+echo "[gen test results]"
+./logs-to-c --spam=SPBASE/spam-test.log \
+ --ham=NSBASE/ham-test.log \
+ --count --cffile=../rules --scoreset=$SCORESET | tee $LOGDIR/test
echo "[STATISTICS file generation]"
-./mk-baseline-results $SCORESET | tee gen-$NAME.statistics
+./mk-baseline-results $SCORESET | tee $LOGDIR/statistics
fi
exit 0
Added: spamassassin/trunk/masses/tenpass/split-log-into-buckets-random
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/tenpass/split-log-into-buckets-random Wed Jul 21 21:07:13 2004
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+#
+# split-log-into-buckets-random [n]
+#
+# Split a mass-check log into n roughly equal-sized buckets, assigning
+# each input line to a randomly chosen bucket as it is read.
+# n defaults to 10.
+
+my $numbuckets = 0;
+if (defined $ARGV[0]) {
+ $numbuckets = $ARGV[0]+0;
+}
+$numbuckets ||= 10;
+
+my %buckets = ();
+foreach my $i (1 .. $numbuckets) {
+ print "Creating split-$i.log\n";
+ open ($buckets{$i}, ">split-$i.log");
+}
+
+while (<STDIN>) {
+ select $buckets{1+int(rand()*$numbuckets)}; print $_;
+}
+
+foreach my $i (1 .. $numbuckets) {
+ close $buckets{$i};
+}
+
+#!/usr/bin/perl
+#
+# split-log-into-buckets-random [n]
+#
+# Split a mass-check log into n roughly equal-sized buckets, assigning
+# each input line to a randomly chosen bucket as it is read.
+# n defaults to 10.
+
+my $numbuckets = 0;
+if (defined $ARGV[0]) {
+ $numbuckets = $ARGV[0]+0;
+}
+$numbuckets ||= 10;
+
+my %buckets = ();
+foreach my $i (1 .. $numbuckets) {
+ print "Creating split-$i.log\n";
+ open ($buckets{$i}, ">split-$i.log");
+}
+
+while (<STDIN>) {
+ select $buckets{1+int(rand()*$numbuckets)}; print $_;
+}
+
+foreach my $i (1 .. $numbuckets) {
+ close $buckets{$i};
+}
+
Added: spamassassin/trunk/masses/validate-model
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/validate-model Wed Jul 21 21:07:13 2004
@@ -0,0 +1,252 @@
+#!/bin/sh
+
+# set SCORESET
+. config
+
+RUNS=10
+PASSES=`seq 1 ${RUNS}`
+
+NAME="set$SCORESET"
+LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
+CACHEDIR="vm-cache/$NAME"
+
+if [ "$NOTE" != "" ]; then
+ LOGDIR="$LOGDIR-$NOTE"
+fi
+
+if [ ! -d $CACHEDIR ]; then
+ mkdir -p $CACHEDIR
+fi
+
+if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
+ echo "Couldn't find logs for $NAME" >&2
+ exit 1
+fi
+
+
+echo "[Doing a scoreset $SCORESET score-generation run]"
+
+# clear out the old logs
+rm -rf $LOGDIR
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
+(
+echo "[config]"
+cat config
+) | tee -a $LOGDIR/log
+
+for PASS in $PASSES; do
+ # Clean out old runs
+ echo "[Cleaning up for pass $PASS]"
+ rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+ NSBASE SPBASE tmp freqs perceptron.scores
+ make clean >/dev/null
+
+ # revert to the previous scoring
+ svn revert ../rules/50_scores.cf
+
+ if [ ! -d $CACHEDIR/$PASS ]; then
+ # Generate 90/10 split logs
+ echo "[Generating 90/10 split ham]"
+ mkdir NSBASE SPBASE
+ cd NSBASE
+ ../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+ for p in $PASSES; do
+ if [ "$p" != "$PASS" ]; then
+ cat split-$p.log >> ham.log
+ else
+ mv split-$p.log ham-validate.log
+ fi
+ done
+ rm -f split-*.log
+
+ echo "[Generating 90/10 split spam]"
+ cd ../SPBASE
+ ../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+ for p in $PASSES; do
+ if [ "$p" != "$PASS" ]; then
+ cat split-$p.log >> spam.log
+ else
+ mv split-$p.log spam-validate.log
+ fi
+ done
+ rm -f split-*.log
+ cd ..
+
+ echo "[Setting up for pass $PASS]"
+ # Ok, setup for a run
+ ln -s SPBASE/spam.log .
+ ln -s NSBASE/ham.log .
+ ln -s SPBASE/spam-validate.log .
+ ln -s NSBASE/ham-validate.log .
+
+ # try to find number of processors
+ numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
+ else
+ echo "[Retrieving from $CACHEDIR/$PASS]"
+ ln -s $CACHEDIR/$PASS/SPBASE .
+ ln -s $CACHEDIR/$PASS/NSBASE .
+ ln -s $CACHEDIR/$PASS/tmp .
+ ln -s $CACHEDIR/$PASS/freqs .
+
+ ln -s SPBASE/spam.log .
+ ln -s NSBASE/ham.log .
+ ln -s SPBASE/spam-validate.log .
+ ln -s NSBASE/ham-validate.log .
+ fi
+
+ echo "[Generating perceptron]"
+ # Generate perceptron with full logs
+ make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
+
+ (
+ echo "[pass $PASS start]"
+ pwd
+ date
+ ./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+ mv perceptron.scores $LOGDIR/scores.$PASS
+ echo "[pass $PASS end]"
+ ) | tee -a $LOGDIR/log
+ ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores.$PASS > /tmp/runGA.$$
+ mv /tmp/runGA.$$ ../rules/50_scores.cf
+ echo "[evaluating performance]" | tee -a $LOGDIR/log
+ ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives.$PASS --fplog $LOGDIR/false_positives.$PASS > $LOGDIR/validate.$PASS
+
+ if [ ! -d $CACHEDIR/$PASS ]; then
+ echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
+ mkdir -p $CACHEDIR/$PASS
+ mv tmp freqs SPBASE NSBASE $CACHEDIR/$PASS
+ fi
+
+done
+
+./extract-results $LOGDIR/validate.* > $LOGDIR/validate
+
+exit 0
+#!/bin/sh
+
+# set SCORESET
+. config
+
+RUNS=10
+PASSES=`seq 1 ${RUNS}`
+
+NAME="set$SCORESET"
+LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
+CACHEDIR="vm-cache/$NAME"
+
+if [ "$NOTE" != "" ]; then
+ LOGDIR="$LOGDIR-$NOTE"
+fi
+
+if [ ! -d $CACHEDIR ]; then
+ mkdir -p $CACHEDIR
+fi
+
+if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
+ echo "Couldn't find logs for $NAME" >&2
+ exit 1
+fi
+
+
+echo "[Doing a scoreset $SCORESET score-generation run]"
+
+# clear out the old logs
+rm -rf $LOGDIR
+# Create a directory to organize the logs with this group of settings
+mkdir $LOGDIR
+
+(
+echo "[config]"
+cat config
+) | tee -a $LOGDIR/log
+
+for PASS in $PASSES; do
+ # Clean out old runs
+ echo "[Cleaning up for pass $PASS]"
+ rm -rf spam-validate.log ham-validate.log spam.log ham.log \
+ NSBASE SPBASE tmp freqs perceptron.scores
+ make clean >/dev/null
+
+ # revert to the previous scoring
+ svn revert ../rules/50_scores.cf
+
+ if [ ! -d $CACHEDIR/$PASS ]; then
+ # Generate 90/10 split logs
+ echo "[Generating 90/10 split ham]"
+ mkdir NSBASE SPBASE
+ cd NSBASE
+ ../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
+ for p in $PASSES; do
+ if [ "$p" != "$PASS" ]; then
+ cat split-$p.log >> ham.log
+ else
+ mv split-$p.log ham-validate.log
+ fi
+ done
+ rm -f split-*.log
+
+ echo "[Generating 90/10 split spam]"
+ cd ../SPBASE
+ ../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
+ for p in $PASSES; do
+ if [ "$p" != "$PASS" ]; then
+ cat split-$p.log >> spam.log
+ else
+ mv split-$p.log spam-validate.log
+ fi
+ done
+ rm -f split-*.log
+ cd ..
+
+ echo "[Setting up for pass $PASS]"
+ # Ok, setup for a run
+ ln -s SPBASE/spam.log .
+ ln -s NSBASE/ham.log .
+ ln -s SPBASE/spam-validate.log .
+ ln -s NSBASE/ham-validate.log .
+
+ # try to find number of processors
+ numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
+ else
+ echo "[Retrieving from $CACHEDIR/$PASS]"
+ ln -s $CACHEDIR/$PASS/SPBASE .
+ ln -s $CACHEDIR/$PASS/NSBASE .
+ ln -s $CACHEDIR/$PASS/tmp .
+ ln -s $CACHEDIR/$PASS/freqs .
+
+ ln -s SPBASE/spam.log .
+ ln -s NSBASE/ham.log .
+ ln -s SPBASE/spam-validate.log .
+ ln -s NSBASE/ham-validate.log .
+ fi
+
+ echo "[Generating perceptron]"
+ # Generate perceptron with full logs
+ make -j $numcpus SCORESET=$SCORESET > $LOGDIR/make.output 2>&1
+
+ (
+ echo "[pass $PASS start]"
+ pwd
+ date
+ ./perceptron -p $HAM_PREFERENCE -t $THRESHOLD -e $EPOCHS
+ mv perceptron.scores $LOGDIR/scores.$PASS
+ echo "[pass $PASS end]"
+ ) | tee -a $LOGDIR/log
+ ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf $LOGDIR/scores.$PASS > /tmp/runGA.$$
+ mv /tmp/runGA.$$ ../rules/50_scores.cf
+ echo "[evaluating performance]" | tee -a $LOGDIR/log
+ ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log --scoreset $SCORESET --fnlog $LOGDIR/false_negatives.$PASS --fplog $LOGDIR/false_positives.$PASS > $LOGDIR/validate.$PASS
+
+ if [ ! -d $CACHEDIR/$PASS ]; then
+ echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
+ mkdir -p $CACHEDIR/$PASS
+ mv tmp freqs SPBASE NSBASE $CACHEDIR/$PASS
+ fi
+
+done
+
+./extract-results $LOGDIR/validate.* > $LOGDIR/validate
+
+exit 0