You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by hs...@apache.org on 2004/08/09 19:17:48 UTC

svn commit: rev 36119 - spamassassin/trunk/masses

Author: hstern
Date: Mon Aug  9 10:17:47 2004
New Revision: 36119

Added:
   spamassassin/trunk/masses/model-statistics   (contents, props changed)
Modified:
   spamassassin/trunk/masses/validate-model
Log:
 * validate-model
 * model-statistics
   Added a display of some statistics about the generated model (mean error
   rates, etc.)



Added: spamassassin/trunk/masses/model-statistics
==============================================================================
--- (empty file)
+++ spamassassin/trunk/masses/model-statistics	Mon Aug  9 10:17:47 2004
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+
+# This script is used to print some statistics about classification accuracy
+# with a k-fold cross validation
+
+use strict;
+
+my $lambda = 50;  # desired lambda for TCR calculation
+
+if ( scalar(@ARGV) < 1 ) {
+	print STDERR "Usage: model-statistics [validate]\n";
+	exit 1;
+}
+
+my (@fp1, @fn1, @tcr1);  # one value of each statistic per input line
+open (my $fh, '<', $ARGV[0]) || die "Cannot open $ARGV[0]: $!";  # read-only
+while (my $line = <$fh>) {
+	my @x = split(' ', $line);  # ' ' mode also ignores leading whitespace
+	next unless @x;             # blank line: nothing to divide
+	push (@fp1, $x[2] / ($x[0] + $x[2]));
+	push (@fn1, $x[3] / ($x[1] + $x[3]));
+	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
+}
+close ($fh);
+
+stat_analysis ("False positives", "pct", \@fp1);
+stat_analysis ("False negatives", "pct", \@fn1);
+stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);
+
+# Print the mean and sample standard deviation of one set of measurements.
+#   $title - label printed at the start of the output line
+#   $pct   - "pct" prints both values multiplied by 100; otherwise unscaled
+#   $s1    - reference to the array of numeric samples
+sub stat_analysis {
+	my ($title, $pct, $s1) = @_;
+
+	# Number of samples in the set.
+	my $n = scalar(@{$s1});
+	if ( $n < 1 ) {
+		print STDERR "$title: no samples\n";
+		return;
+	}
+
+	# mean = 1/n * sum(s[i]); iterate over the elements themselves so that
+	# every sample s[0]..s[n-1] is counted exactly once.
+	my $mean_s1 = 0;
+	$mean_s1 += $_ foreach @{$s1};
+	$mean_s1 /= $n;
+
+	# var = 1/(n-1) * sum((mean - s[i])^2); a single sample has no spread to
+	# estimate, so report 0 rather than dividing by zero.
+	my $var_s1 = 0;
+	$var_s1 += ($mean_s1 - $_)**2 foreach @{$s1};
+	$var_s1 = $n > 1 ? $var_s1 / ($n - 1) : 0;
+	my $std_s1 = sqrt($var_s1);
+
+	# SA developers like percentage points instead of probabilities.
+	if ( $pct eq "pct" ) {
+		printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
+	} else {
+		printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
+	}
+}

Modified: spamassassin/trunk/masses/validate-model
==============================================================================
--- spamassassin/trunk/masses/validate-model	(original)
+++ spamassassin/trunk/masses/validate-model	Mon Aug  9 10:17:47 2004
@@ -123,4 +123,6 @@
 
 ./extract-results $LOGDIR/validate.* > $LOGDIR/validate
 
+./model-statistics $LOGDIR/validate
+
 exit 0