You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2012/03/12 23:45:17 UTC
svn commit: r1299916 - /spamassassin/trunk/masses/rule-dev/so-display.bin
Author: axb
Date: Mon Mar 12 22:45:17 2012
New Revision: 1299916
URL: http://svn.apache.org/viewvc?rev=1299916&view=rev
Log:
file was missing to make use of JM maildir-scan-headers routines
Added:
spamassassin/trunk/masses/rule-dev/so-display.bin
Added: spamassassin/trunk/masses/rule-dev/so-display.bin
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/so-display.bin?rev=1299916&view=auto
==============================================================================
--- spamassassin/trunk/masses/rule-dev/so-display.bin (added)
+++ spamassassin/trunk/masses/rule-dev/so-display.bin Mon Mar 12 22:45:17 2012
@@ -0,0 +1,106 @@
+#!/usr/bin/perl
+#
+# so-display spamfile hamfile
+# combineddatasource | so-display
+#
+# Compute "S/O ratios" for data. S/O stands for Spam/Overall, and denotes the
+# probability that a hit for that datum is spam (in the Bayesian style).
+#
+# The combined data source should contain lines in the format "X data", where
+# "X" is either "h" or "s" for ham or spam, and "data" is what will be collated
+# and reported. Lines that do not have that shape are skipped with a warning.
+#
+# Otherwise "hamfile" and "spamfile" contain data entries, one per line. Both
+# files must be readable; the script dies if either cannot be opened.
+#
+# Output is one row per datum: its S/O ratio, the percentage of all spam hits
+# and of all ham hits it accounts for, and the datum itself.
+#
+# Feb 11 2003 jm
+
+use strict;
+use warnings;
+
+my $spamdata = shift @ARGV;
+my $hamdata = shift @ARGV;
+
+# No file arguments at all selects the combined "X data" input, read via <>.
+my $combined = defined $spamdata ? 0 : 1;
+
+# Hit counts per datum: in spam, in ham, and in either corpus.
+my (%spam, %ham, %found);
+
+if ($combined) {
+  while (my $line = <>) {
+    chomp $line;
+
+    # Capture label and datum from one successful match. Testing $1 after a
+    # failed s/// would see the previous line's label and misfile this line.
+    my ($label, $datum) = $line =~ /^(\S+)\s+(.*)/s;
+    if (!defined $label) {
+      warn sprintf("so-display: skipping malformed line %d\n", $.);
+      next;
+    }
+
+    # Only the label "s" counts as spam; any other label is tallied as ham.
+    if ($label eq 's') { $spam{$datum}++; } else { $ham{$datum}++; }
+    $found{$datum}++;
+  }
+
+} else {
+  defined $hamdata or die "usage: so-display spamfile hamfile\n";
+
+  # Three-arg open on a lexical handle, checked: an unreadable file is a fatal
+  # error rather than being silently treated as an empty corpus.
+  foreach my $source ([$spamdata, \%spam], [$hamdata, \%ham]) {
+    my ($file, $count) = @$source;
+    open (my $in, '<', $file) or die "so-display: cannot read $file: $!\n";
+    while (my $datum = <$in>) {
+      chomp $datum;
+      $found{$datum}++;
+      $count->{$datum}++;
+    }
+    close $in;
+  }
+}
+
+# Corpus-wide hit totals. Every datum also gets a numeric count in both hashes
+# so the arithmetic and sort below never see undef.
+my $stot = 0;
+my $htot = 0;
+foreach my $id (keys %found) {
+  $ham{$id} ||= 0; $spam{$id} ||= 0;
+  $htot += $ham{$id}; $stot += $spam{$id};
+}
+
+# An empty corpus gets a tiny nonzero total so the divisions below are defined.
+$htot ||= 0.000001;
+$stot ||= 0.000001;
+
+# S/O = spam rate / (spam rate + ham rate), where each rate is the datum's
+# share of the hits in its own corpus.
+my %so;
+foreach my $id (keys %found) {
+  my $hamrate = $ham{$id} / $htot;
+  my $spamrate = $spam{$id} / $stot;
+  my $overall = ($hamrate + $spamrate) || 0.000001;
+  $so{$id} = $spamrate / $overall;
+}
+
+printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
+
+# Ascending S/O, then ascending spam hits, then descending ham hits. The final
+# string comparison keeps the order of full ties stable from run to run, which
+# hash key order alone does not guarantee.
+foreach my $id (sort {
+    $so{$a} <=> $so{$b}
+    || $spam{$a} <=> $spam{$b}
+    || $ham{$b} <=> $ham{$a}
+    || $a cmp $b
+  } keys %so)
+{
+  printf ("%6.3f %6.3f %6.3f %s\n",
+    $so{$id}, ($spam{$id}*100) / $stot, ($ham{$id}*100) / $htot, $id);
+}
+
+exit;