You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/08/18 21:05:07 UTC
svn commit: r432683 [3/3] - in /spamassassin/branches/3.1: build/automc/
masses/rule-qa/ masses/rule-qa/automc/
Added: spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time?rev=432683&view=auto
==============================================================================
--- spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time (added)
+++ spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time Fri Aug 18 12:05:06 2006
@@ -0,0 +1,532 @@
+#!/usr/bin/perl -w
+#
+# rule-hits-over-time - produce graphs of rule hits over time, using gnuplot
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+use GD;
+use Statistics::DEA;
+
+use strict;
+use warnings;
+use Fcntl;
+use Getopt::Long;
+use SDBM_File;
+
+sub usage {
+ die q{
+usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...]
+
+ --rule=rulename specify rule to map
+ --period=secs specify period (default: 1 day)
+ --ignore_older=days ignore hits older than N days (default: 0 = none)
+ --scale_period=n scale period up to N items of data, 0=no scaling
+ (default: 0)
+ --size_x=pixels width of output graphs, in pixels (def: 800)
+ --size_y=pixels height of ONE of the output graphs, in pixels
+ (default: 400)
+ --cgi CGI output, to stdout with HTTP headers
+ --text text output only
+};
+}
+
+use vars qw(
+ $opt_rule $opt_size_x $opt_size_y $opt_text $opt_cgi
+ $opt_period $opt_scale_period $opt_ignore_older
+ $opt_debug
+);
+
+GetOptions(
+ 'rule=s',
+ 'size_x=i',
+ 'size_y=i',
+ 'text',
+ 'cgi',
+ 'scale_period=i',
+ 'ignore_older=i',
+ 'period=i',
+ 'debug'
+) or usage();
+
+usage() unless $opt_rule;
+
+my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug
+# $DEBUG_TMPDIR = 1;
+
+# fix PATHs for sucky Solaris compatibility.
+$ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH};
+$ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib";
+
+my $rule_re = qr/[, ]${opt_rule}[, ]/;
+
+# my $period = $opt_period || (24 * 60 * 60 * 1);
+my $period = $opt_period || 3600;
+
+my $graph_x = $opt_size_x || 800;
+my $graph_y = $opt_size_y || 400;
+
+my $fname_counter = 1;
+my %graph_png_data = ();
+
+my %allbuckets = ();
+my %allresults = ();
+my @allfiles = ();
+
+my $graph_times = [];
+my $graph_data = [];
+
+my $this_file_results;
+my $lastbucket;
+my $nextbucket;
+my $seen_y;
+my $seen_n;
+
+my $tmpdir = "/tmp/rulehits.$$";
+if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); }
+
+mkdir ($tmpdir) or die "collided on $tmpdir";
+
+my $outdir = ".";
+if ($opt_cgi) {
+ $outdir = $tmpdir;
+}
+
+my $file_sets = [ ]; # split into ham and spam
+$file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ];
+
+foreach my $file (@ARGV) {
+ if ($file =~ /\bham\b/) {
+ push @{$file_sets->[1]}, $file;
+ } else {
+ push @{$file_sets->[0]}, $file;
+ }
+}
+
+foreach my $set (@{$file_sets}) {
+ @allfiles = ();
+ %allbuckets = ();
+ %allresults = ();
+
+ my $settitle = '';
+ if ($set->[0] =~ /^TITLE:(.*)$/) {
+ $settitle = $1; shift(@{$set});
+ }
+
+ create_gp("$opt_rule $settitle");
+
+ foreach my $file (@{$set}) {
+ if (!$opt_text) {
+ my $title = $file;
+ $title =~ s/^.*\///;
+ }
+ push (@allfiles, $file);
+
+ if (1) {
+ # use an on-disk file to avoid massive VM usage for this hash
+ # on huge datasets
+ unlink("$tmpdir/graph.tmp.dir");
+ unlink("$tmpdir/graph.tmp.pag");
+ tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp",
+ O_RDWR|O_CREAT, 0600) or die "tie failed: $!";
+ }
+ else {
+ %{$allresults{$file}} = ();
+ }
+
+ $this_file_results = $allresults{$file};
+ read_logs($file);
+
+ $graph_times = [];
+ $graph_data = [];
+ summarise();
+ }
+
+ $opt_scale_period and collapse_periods();
+
+ plot_gp();
+}
+
+my $format = "gif";
+
+{
+ my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2));
+ my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1);
+ my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1);
+
+ if (!$file01 || !$file02) {
+ warn "bad input. leaving graph blank";
+ }
+ else {
+ $both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1);
+ $both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1);
+ }
+
+ if ($opt_cgi) {
+ use CGI qw(:standard);
+ print header("image/$format"); binmode STDOUT;
+ print STDOUT $both->$format();
+ }
+ else {
+ open(IMG, ">both.$format") or die $!; binmode IMG;
+ print IMG $both->$format();
+ close IMG;
+ }
+
+ $both->gif();
+}
+
+if (!$DEBUG_TMPDIR) {
+ unlink(<$tmpdir/*.*>); rmdir $tmpdir;
+} else {
+ system ("ls -l $tmpdir/*.* 1>&2");
+}
+
+exit;
+
+sub summarise {
+ foreach my $bucket (sort keys %allbuckets) {
+ my @cols = ();
+ foreach my $file (@allfiles) {
+ my $res = $allresults{$file}->{$bucket};
+ my $sy;
+ my $sn;
+
+ if (!$res) {
+ $sn = $sy = -1;
+ }
+ elsif ($res !~ /^y(\d+)n(\d+)$/) {
+ warn "bad results: $res for $file $bucket";
+ next;
+ }
+ else {
+ $sy = $1;
+ $sn = $2;
+ }
+
+ if (!defined $sy && !defined $sn) {
+ $sn = $sy = -1;
+ } elsif (!defined $sy || !defined $sn) {
+ # assert: enforce both < 0, if either is
+ warn "oops? sy=$sy sn=$sn, should be both < 0";
+ $sn = $sy = -1;
+ }
+
+ if (($sy+$sn) > 0) {
+ push @cols, ($sy / ($sy + $sn)) * 100.0;
+ }
+ else {
+ push @cols, -1;
+ }
+ }
+
+ if ($opt_text) {
+ print $bucket," ".join(' ',@cols)."\n";
+ }
+ else {
+ push (@{$graph_times}, $bucket);
+ push (@{$graph_data}, \@cols);
+ }
+ }
+}
+
+
+sub collapse_periods {
+ while (scalar @{$graph_data} > $opt_scale_period) {
+ my $num_files = (scalar @allfiles - 1);
+ my $newtimes = [ ];
+ my $newdata = [ ];
+ my $i;
+ for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) {
+ $newtimes->[$i >> 1] = $graph_times->[$i];
+ foreach my $j (0 .. $num_files)
+ {
+ my $v1 = $graph_data->[$i]->[$j];
+ my $v2 = $graph_data->[$i+1]->[$j];
+ if (!defined $v2) { $v2 = -1; }
+
+ if ($v1 >= 0.0 && $v2 >= 0.0) {
+ # both are valid. take their mean
+ $v1 = ($v1 + $v2) / 2.0;
+ }
+ elsif ($v2 >= 0.0) {
+ # only one is valid; use it and ignore the invalid one
+ $v1 = $v2;
+ }
+ else {
+ # we're good, v1 is the valid one anyway
+ }
+
+ $newdata->[$i >> 1]->[$j] = $v1;
+ }
+ }
+ @{$graph_times} = @{$newtimes};
+ @{$graph_data} = @{$newdata};
+ $period *= 2;
+ }
+}
+
+
+sub read_logs {
+ my $file = shift;
+
+ # limit to a range from [4 years ago, today] to avoid OOM craziness
+ # from corrupt input
+ #
+ if ($opt_ignore_older <= 0) {
+ $opt_ignore_older = 365 * 4;
+ }
+ my $limit_hi = time;
+ my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older);
+
+ $lastbucket = undef;
+ $nextbucket = undef;
+ $seen_y = 0;
+ $seen_n = 0;
+
+ if ($file =~ /\.gz$/) {
+ open (IN, "gunzip -cd '$file'|") or die "cannot gunzip $file";
+ } else {
+ open (IN, "<$file") or die "cannot read $file";
+ }
+
+ while (<IN>) {
+ next if /^#/;
+
+ my $t;
+ /\btime=(\d+),/ and $t = $1;
+ next unless $t;
+
+ if ($t < $limit_lo || $t > $limit_hi) {
+ warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)";
+ next;
+ }
+
+ my $found = ($_ =~ $rule_re);
+
+ if (!defined $lastbucket) {
+ $lastbucket = $t - ($t % $period);
+ $nextbucket = $lastbucket + $period;
+ }
+
+ if ($t < $nextbucket) {
+ if ($found) {
+ $seen_y++;
+ } else {
+ $seen_n++;
+ }
+ }
+ else {
+ while ($t >= $nextbucket) {
+ completeline();
+ $lastbucket = $nextbucket;
+ $nextbucket += $period;
+ }
+ }
+ }
+ close IN;
+ completeline();
+}
+
+sub completeline {
+ return unless ($lastbucket);
+ $allbuckets{$lastbucket} = undef;
+ $this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n;
+ $seen_y = 0;
+ $seen_n = 0;
+}
+
+sub create_gp {
+ my $title = shift;
+
+ my $mailtype = 'mail';
+ if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; }
+ my $y_label = "\%age of $mailtype in period";
+
+ $SIG{PIPE} = sub {
+ die "unexpected SIGPIPE received!";
+ };
+
+ open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot";
+
+ # eye-candy
+ my $niceperiod = "$period secs";
+ if ($period % (24*60*60) == 0) {
+ $niceperiod = ($period / (24*60*60))." days";
+ }
+
+ # (NOTE: -1% hitrate means no data for that time period)'
+ print GP qq{
+
+ set terminal png small \\
+ interlace size $graph_x,$graph_y \\
+ xffffff x444444 x33cc00 \\
+ xff3300 x0000cc x99cc00 xff9900 \\
+ xcccc00 x333333 x999999 x9500d3
+
+ set out '$tmpdir/out.png'
+
+ set grid back xtics ytics
+
+ set xlabel 'Time, in blocks of $niceperiod'
+ set xdata time
+ set timefmt "%Y-%m-%d-%H"
+ set format x "%04Y%02m%02d"
+
+ set ylabel '$y_label'
+ set yrange [0:*]
+
+ set title "$title"
+ set key left top Left nobox
+
+ };
+}
+
+sub fmt_time_t {
+ my $tt = shift;
+ use POSIX qw(strftime);
+ return strftime "%Y-%m-%d-%H", gmtime($tt);
+}
+
+sub plot_gp {
+ my $num_files = (scalar @allfiles - 1);
+ my $num_datapoints = (scalar @{$graph_data} - 1);
+
+ # specify a number of alphas for Statistics::DEA. Right now,
+ # the graph is pretty unreadable with more than one.
+ my $dea_alphas = [ 0.9 ];
+ my $num_alphas = (scalar @{$dea_alphas} - 1);
+
+ my $times = [ ];
+ my $avgs = [ ];
+
+ my $graphname = sprintf("file%02d", $fname_counter++);
+
+ if (!$opt_text)
+ {
+ if (@{$graph_data}) {
+ my $deas = ();
+ foreach my $i (0 .. $num_files) {
+ foreach my $a (0 .. $num_alphas) {
+ $deas->[$a]->[$i] =
+ Statistics::DEA->new($dea_alphas->[$a], $period * 3);
+ }
+ }
+
+ foreach my $j (0 .. $num_datapoints) {
+ my (@datas) = @{$graph_data->[$j]};
+ $times->[$j] = fmt_time_t($graph_times->[$j]);
+
+ foreach my $i (0 .. $num_files) {
+ my $d = $datas[$i];
+
+ foreach my $a (0 .. $num_alphas) {
+ if ($d >= 0) {
+ $deas->[$a]->[$i]->update($d, $j);
+ }
+
+ my $avg;
+ eval {
+ # this can die if it hasn't received enough data!
+ # so trap with an eval.
+ $avg = $deas->[$a]->[$i]->average();
+ };
+ $avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1;
+ }
+ }
+ }
+ }
+
+ # write the data plotfile
+ open (DATA, ">$tmpdir/plot.$graphname.data") or die;
+ if (@{$graph_data})
+ {
+ foreach my $j (0 .. $num_datapoints) {
+ print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n";
+ }
+ } else {
+ # a fake datapoint so gnuplot doesn't puke on us
+ print DATA fmt_time_t(0)," 0 0\n";
+ }
+ close DATA or die;
+
+
+ # write the avgs plotfiles
+ foreach my $a (0 .. $num_alphas) {
+ open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die;
+ if (@{$graph_data}) {
+ foreach my $j (0 .. $num_datapoints) {
+ print DATA $times->[$j]," ",
+ defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0',
+ "\n";
+ }
+ } else {
+ # a fake datapoint so gnuplot doesn't puke on us
+ print DATA fmt_time_t(0)," 0 0\n";
+ }
+ close DATA or die;
+ }
+
+
+ # and the commands file
+ my @plot = ();
+ foreach my $i (0 .. $num_files) {
+ my $legend = filename_to_legend ($allfiles[$i]);
+ my $style = $i+1;
+ my $col = $i+2;
+
+ push @plot,
+ qq{ '$tmpdir/plot.$graphname.data' using }.
+ qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
+ # note: using "lt $style" gives us points in the same
+ # colour as the lines in the smoothed graph below
+ qq{ with points lt $style pt $style ps 1 }.
+ qq{ title '$legend' };
+
+ foreach my $a (0 .. $num_alphas) {
+ push @plot,
+ qq{ '$tmpdir/avgs$a.$graphname.data' using }.
+ qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
+ qq{ with lines lt $style lw 3 }.
+ qq{ title ' (DEA a=$dea_alphas->[$a])' };
+ }
+ }
+
+ print GP "plot ",join(", ", @plot), "\n";
+ close GP
+ or warn "gnuplot command exited: $?";
+
+ $graph_png_data{$graphname} = readfile("$tmpdir/out.png");
+ }
+}
+
+sub readfile {
+ open (IN, "<$_[0]") or die "cannot read $_[0]";
+ binmode IN;
+ my $str = join('',<IN>);
+ close IN;
+ return $str;
+}
+
+sub filename_to_legend {
+ my $f = shift;
+
+ $f =~ s/^.*\///;
+ $f =~ s/LOGS\.all-//;
+ $f =~ s/\.log\.\S+$//;
+ return $f;
+}
Propchange: spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time
------------------------------------------------------------------------------
svn:executable = *