You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2012/03/12 23:45:17 UTC
svn commit: r1299916 - /spamassassin/trunk/masses/rule-dev/so-display.bin

Author: axb
Date: Mon Mar 12 22:45:17 2012
New Revision: 1299916

URL: http://svn.apache.org/viewvc?rev=1299916&view=rev
Log:
file was missing to make use of JM maildir-scan-headers routines

Added:
    spamassassin/trunk/masses/rule-dev/so-display.bin

Added: spamassassin/trunk/masses/rule-dev/so-display.bin
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-dev/so-display.bin?rev=1299916&view=auto
==============================================================================
--- spamassassin/trunk/masses/rule-dev/so-display.bin (added)
+++ spamassassin/trunk/masses/rule-dev/so-display.bin Mon Mar 12 22:45:17 2012
@@ -0,0 +1,66 @@
+#!/usr/bin/perl
+#
+# so-display spamfile hamfile
+# combineddatasource | so-display
+#
+# Compute "S/O ratios" for data. S/O stands for Spam/Overall, and denotes the
+# probability that a hit for that datum is spam (in the Bayesian style).
+#
+# In the piped form each input line has the format "X data", where "X" is
+# "s" for spam (anything else counts as ham) and "data" is what is collated.
+# Otherwise "spamfile" and "hamfile" contain data entries, one per line.
+#
+# Feb 11 2003 jm
+
+use strict;
+use warnings;
+
+my ($spamdata, $hamdata) = @ARGV;
+my (%spam, %ham, %found, %so);
+
+if (!defined $spamdata) {
+  # Combined mode. Test the substitution itself: $1 is not trustworthy after a
+  # failed match, so lines without an "X data" shape are reported and skipped.
+  while (my $line = <STDIN>) {
+    chomp $line;
+    unless ($line =~ s/^(\S+)\s+//) {
+      warn "so-display: skipping malformed line $.: $line\n";
+      next;
+    }
+    if ($1 eq 's') { $spam{$line}++; } else { $ham{$line}++; }
+    $found{$line}++;
+  }
+} else {
+  defined $hamdata or die "usage: so-display spamfile hamfile\n";
+  # Three-arg open on lexical handles; an unreadable file is a hard error.
+  open(my $sfh, '<', $spamdata) or die "cannot open $spamdata: $!\n";
+  while (<$sfh>) { chomp; $found{$_}++; $spam{$_}++; }
+  close $sfh;
+  open(my $hfh, '<', $hamdata) or die "cannot open $hamdata: $!\n";
+  while (<$hfh>) { chomp; $found{$_}++; $ham{$_}++; }
+  close $hfh;
+}
+
+# Class totals; the tiny floor avoids dividing by zero when a class is empty.
+my ($stot, $htot) = (0, 0);
+foreach my $id (keys %found) {
+  $ham{$id} ||= 0; $spam{$id} ||= 0;
+  $htot += $ham{$id}; $stot += $spam{$id};
+}
+$htot ||= 0.000001; $stot ||= 0.000001;
+
+# S/O = spam rate / (spam rate + ham rate), each rate relative to its class.
+foreach my $id (keys %found) {
+  my $hamrate = $ham{$id} / $htot;
+  my $spamrate = $spam{$id} / $stot;
+  $so{$id} = $spamrate / (($hamrate + $spamrate) || 0.000001);
+}
+
+printf("%6s  %6s  %6s   %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
+# Final "cmp" tie-break keeps the order of equal rows stable between runs.
+foreach my $id (sort { $so{$a} <=> $so{$b} || $spam{$a} <=> $spam{$b}
+                    || $ham{$b} <=> $ham{$a} || $a cmp $b } keys %so) {
+  printf("%6.3f  %6.3f  %6.3f   %s\n",
+        $so{$id}, ($spam{$id}*100) / $stot, ($ham{$id}*100) / $htot, $id);
+}
+exit;