You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/08/18 21:05:07 UTC

svn commit: r432683 [3/3] - in /spamassassin/branches/3.1: build/automc/ masses/rule-qa/ masses/rule-qa/automc/

Added: spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time
URL: http://svn.apache.org/viewvc/spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time?rev=432683&view=auto
==============================================================================
--- spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time (added)
+++ spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time Fri Aug 18 12:05:06 2006
@@ -0,0 +1,532 @@
+#!/usr/bin/perl -w
+#
+# rule-hits-over-time - produce graphs of rule hits over time, using gnuplot
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </...@LICENSE>
+
+use GD;
+use Statistics::DEA;
+
+use strict;
+use warnings;
+use Fcntl;
+use Getopt::Long;
+use SDBM_File;
+
+sub usage {
+  die q{
+usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...]
+
+  --rule=rulename       specify rule to map
+  --period=secs         specify period (default: 1 day)
+  --ignore_older=days   ignore hits older than N days (default: 0 = none)
+  --scale_period=n      scale period up to N items of data, 0=no scaling
+                        (default: 0)
+  --size_x=pixels       width of output graphs, in pixels (def: 800)
+  --size_y=pixels       height of ONE of the output graphs, in pixels
+                        (default: 400)
+  --cgi                 CGI output, to stdout with HTTP headers
+  --text                text output only
+};
+}
+
+use vars qw(
+        $opt_rule $opt_size_x $opt_size_y $opt_text $opt_cgi
+        $opt_period $opt_scale_period $opt_ignore_older
+        $opt_debug
+);
+
+GetOptions(
+        'rule=s',
+        'size_x=i',
+        'size_y=i',
+        'text',
+        'cgi',
+        'scale_period=i',
+        'ignore_older=i',
+        'period=i',
+        'debug'
+) or usage();
+
+usage() unless $opt_rule;
+
+my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug
+# $DEBUG_TMPDIR = 1;
+
+# fix PATHs for sucky Solaris compatibility.
+$ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH};
+$ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib";
+
+my $rule_re = qr/[, ]${opt_rule}[, ]/;
+
+# my $period = $opt_period || (24 * 60 * 60 * 1);
+my $period = $opt_period || 3600;
+
+my $graph_x                         = $opt_size_x || 800;
+my $graph_y                         = $opt_size_y || 400;
+
+my $fname_counter = 1;
+my %graph_png_data = ();
+
+my %allbuckets = ();
+my %allresults = ();
+my @allfiles = ();
+
+my $graph_times = [];
+my $graph_data = [];
+
+my $this_file_results;
+my $lastbucket;
+my $nextbucket;
+my $seen_y;
+my $seen_n;
+
+my $tmpdir = "/tmp/rulehits.$$";
+if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); }
+
+mkdir ($tmpdir) or die "collided on $tmpdir";
+
+my $outdir = ".";
+if ($opt_cgi) {
+  $outdir = $tmpdir;
+}
+
+my $file_sets = [ ];    # split into ham and spam
+$file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ];
+
+foreach my $file (@ARGV) {
+  if ($file =~ /\bham\b/) {
+    push @{$file_sets->[1]}, $file;
+  } else {
+    push @{$file_sets->[0]}, $file;
+  }
+}
+
+foreach my $set (@{$file_sets}) {
+  @allfiles = ();
+  %allbuckets = ();
+  %allresults = ();
+
+  my $settitle = '';
+  if ($set->[0] =~ /^TITLE:(.*)$/) {
+    $settitle = $1; shift(@{$set});
+  }
+
+  create_gp("$opt_rule $settitle");
+
+  foreach my $file (@{$set}) {
+    if (!$opt_text) {
+      my $title = $file;
+      $title =~ s/^.*\///;
+    }
+    push (@allfiles, $file);
+
+    if (1) {
+      # use an on-disk file to avoid massive VM usage for this hash
+      # on huge datasets
+      unlink("$tmpdir/graph.tmp.dir");
+      unlink("$tmpdir/graph.tmp.pag");
+      tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp",
+                O_RDWR|O_CREAT, 0600) or die "tie failed: $!";
+    }
+    else {
+      %{$allresults{$file}} = ();
+    }
+
+    $this_file_results = $allresults{$file};
+    read_logs($file);
+
+    $graph_times = [];
+    $graph_data = [];
+    summarise();
+  }
+
+  $opt_scale_period and collapse_periods();
+
+  plot_gp();
+}
+
+my $format = "gif";
+
+{
+  my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2));
+  my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1);
+  my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1);
+
+  if (!$file01 || !$file02) {
+    warn "bad input.  leaving graph blank";
+  }
+  else {
+    $both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1);
+    $both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1);
+  }
+
+  if ($opt_cgi) {
+    use CGI qw(:standard);
+    print header("image/$format"); binmode STDOUT;
+    print STDOUT $both->$format();
+  }
+  else {
+    open(IMG, ">both.$format") or die $!; binmode IMG;
+    print IMG $both->$format();
+    close IMG;
+  }
+
+  $both->gif();
+}
+
+if (!$DEBUG_TMPDIR) {
+  unlink(<$tmpdir/*.*>); rmdir $tmpdir;
+} else {
+  system ("ls -l $tmpdir/*.* 1>&2");
+}
+
+exit;
+
+sub summarise {
+  foreach my $bucket (sort keys %allbuckets) {
+    my @cols = ();
+    foreach my $file (@allfiles) {
+      my $res = $allresults{$file}->{$bucket};
+      my $sy;
+      my $sn;
+
+      if (!$res) {
+        $sn = $sy = -1;
+      }
+      elsif ($res !~ /^y(\d+)n(\d+)$/) {
+        warn "bad results: $res for $file $bucket";
+        next;
+      }
+      else {
+        $sy = $1;
+        $sn = $2;
+      }
+
+      if (!defined $sy && !defined $sn) {
+        $sn = $sy = -1;
+      } elsif (!defined $sy || !defined $sn) {
+        # assert: enforce both < 0, if either is
+        warn "oops? sy=$sy sn=$sn, should be both < 0";
+        $sn = $sy = -1;
+      }
+
+      if (($sy+$sn) > 0) {
+        push @cols, ($sy / ($sy + $sn)) * 100.0;
+      }
+      else {
+        push @cols, -1;
+      }
+    }
+
+    if ($opt_text) {
+      print $bucket," ".join(' ',@cols)."\n";
+    }
+    else {
+      push (@{$graph_times}, $bucket);
+      push (@{$graph_data}, \@cols);
+    }
+  }
+}
+
+
+sub collapse_periods {
+  while (scalar @{$graph_data} > $opt_scale_period) {
+    my $num_files = (scalar @allfiles - 1);
+    my $newtimes = [ ];
+    my $newdata = [ ];
+    my $i;
+    for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) {
+      $newtimes->[$i >> 1] = $graph_times->[$i];
+      foreach my $j (0 .. $num_files)
+      {
+        my $v1 = $graph_data->[$i]->[$j];
+        my $v2 = $graph_data->[$i+1]->[$j];
+        if (!defined $v2) { $v2 = -1; }
+
+        if ($v1 >= 0.0 && $v2 >= 0.0) {
+          # both are valid.  take their mean
+          $v1 = ($v1 + $v2) / 2.0;
+        }
+        elsif ($v2 >= 0.0) {
+          # only one is valid; use it and ignore the invalid one
+          $v1 = $v2;
+        }
+        else {
+          # we're good, v1 is the valid one anyway
+        }
+
+        $newdata->[$i >> 1]->[$j] = $v1;
+      }
+    }
+    @{$graph_times} = @{$newtimes};
+    @{$graph_data} = @{$newdata};
+    $period *= 2;
+  }
+}
+
+
+sub read_logs {
+  my $file = shift;
+
+  # limit to a range from [4 years ago, today] to avoid OOM craziness
+  # from corrupt input
+  #
+  if ($opt_ignore_older <= 0) {
+    $opt_ignore_older = 365 * 4;
+  }
+  my $limit_hi = time;
+  my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older);
+
+  $lastbucket = undef;
+  $nextbucket = undef;
+  $seen_y = 0;
+  $seen_n = 0;
+
+  if ($file =~ /\.gz$/) {
+    open (IN, "gunzip -cd '$file'|") or die "cannot gunzip $file";
+  } else {
+    open (IN, "<$file") or die "cannot read $file";
+  }
+
+  while (<IN>) {
+    next if /^#/;
+
+    my $t;
+    /\btime=(\d+),/ and $t = $1;
+    next unless $t;
+
+    if ($t < $limit_lo || $t > $limit_hi) {
+      warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)";
+      next;
+    }
+
+    my $found = ($_ =~ $rule_re);
+    
+    if (!defined $lastbucket) {
+      $lastbucket = $t - ($t % $period);
+      $nextbucket = $lastbucket + $period;
+    }
+
+    if ($t < $nextbucket) {
+      if ($found) {
+        $seen_y++;
+      } else {
+        $seen_n++;
+      }
+    }
+    else {
+      while ($t >= $nextbucket) {
+        completeline();
+        $lastbucket = $nextbucket;
+        $nextbucket += $period;
+      }
+    }
+  }
+  close IN;
+  completeline();
+}
+
+sub completeline {
+  return unless ($lastbucket);
+  $allbuckets{$lastbucket} = undef;
+  $this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n;
+  $seen_y = 0;
+  $seen_n = 0;
+}
+
+sub create_gp {
+  my $title = shift;
+
+  my $mailtype = 'mail';
+  if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; }
+  my $y_label = "\%age of $mailtype in period";
+
+  $SIG{PIPE} = sub {
+            die "unexpected SIGPIPE received!";
+        };
+
+  open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot";
+
+  # eye-candy
+  my $niceperiod = "$period secs";
+  if ($period % (24*60*60) == 0) {
+    $niceperiod = ($period / (24*60*60))." days";
+  }
+
+  # (NOTE: -1% hitrate means no data for that time period)'
+  print GP qq{
+
+    set terminal png small \\
+        interlace size $graph_x,$graph_y \\
+        xffffff x444444 x33cc00 \\
+        xff3300 x0000cc x99cc00 xff9900 \\
+        xcccc00 x333333 x999999 x9500d3
+
+    set out '$tmpdir/out.png'
+
+    set grid back xtics ytics
+
+    set xlabel 'Time, in blocks of $niceperiod'
+    set xdata time
+    set timefmt "%Y-%m-%d-%H"
+    set format x "%04Y%02m%02d"
+
+    set ylabel '$y_label'
+    set yrange [0:*]
+
+    set title "$title"
+    set key left top Left nobox
+
+  };
+}
+
+sub fmt_time_t {
+  my $tt = shift;
+  use POSIX qw(strftime);
+  return strftime "%Y-%m-%d-%H", gmtime($tt);
+}
+
+sub plot_gp {
+  my $num_files = (scalar @allfiles - 1);
+  my $num_datapoints = (scalar @{$graph_data} - 1);
+
+  # specify a number of alphas for Statistics::DEA.  Right now,
+  # the graph is pretty unreadable with more than one.
+  my $dea_alphas = [ 0.9 ];
+  my $num_alphas = (scalar @{$dea_alphas} - 1);
+
+  my $times = [ ];
+  my $avgs = [ ];
+
+  my $graphname = sprintf("file%02d", $fname_counter++);
+
+  if (!$opt_text)
+  {
+    if (@{$graph_data}) {
+      my $deas = ();
+      foreach my $i (0 .. $num_files) {
+        foreach my $a (0 .. $num_alphas) {
+          $deas->[$a]->[$i] =
+                Statistics::DEA->new($dea_alphas->[$a], $period * 3);
+        }
+      }
+
+      foreach my $j (0 .. $num_datapoints) {
+        my (@datas) = @{$graph_data->[$j]};
+        $times->[$j] = fmt_time_t($graph_times->[$j]);
+
+        foreach my $i (0 .. $num_files) {
+          my $d = $datas[$i];
+
+          foreach my $a (0 .. $num_alphas) {
+            if ($d >= 0) {
+              $deas->[$a]->[$i]->update($d, $j);
+            }
+
+            my $avg;
+            eval {
+              # this can die if it hasn't received enough data!
+              # so trap with an eval.
+              $avg = $deas->[$a]->[$i]->average();
+            };
+            $avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1;
+          }
+        }
+      }
+    }
+
+    # write the data plotfile
+    open (DATA, ">$tmpdir/plot.$graphname.data") or die;
+    if (@{$graph_data})
+    {
+      foreach my $j (0 .. $num_datapoints) {
+        print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n";
+      }
+    } else {
+      # a fake datapoint so gnuplot doesn't puke on us
+      print DATA fmt_time_t(0)," 0 0\n";
+    }
+    close DATA or die;
+
+
+    # write the avgs plotfiles
+    foreach my $a (0 .. $num_alphas) {
+      open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die;
+      if (@{$graph_data}) {
+        foreach my $j (0 .. $num_datapoints) {
+          print DATA $times->[$j]," ",
+              defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0',
+              "\n";
+        }
+      } else {
+        # a fake datapoint so gnuplot doesn't puke on us
+        print DATA fmt_time_t(0)," 0 0\n";
+      }
+      close DATA or die;
+    }
+
+
+    # and the commands file
+    my @plot = ();
+    foreach my $i (0 .. $num_files) {
+      my $legend = filename_to_legend ($allfiles[$i]);
+      my $style = $i+1;
+      my $col = $i+2;
+
+      push @plot,
+        qq{ '$tmpdir/plot.$graphname.data' using }.
+            qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
+            # note: using "lt $style" gives us points in the same
+            # colour as the lines in the smoothed graph below
+            qq{ with points lt $style pt $style ps 1 }.
+            qq{ title '$legend' };
+
+      foreach my $a (0 .. $num_alphas) {
+        push @plot,
+          qq{ '$tmpdir/avgs$a.$graphname.data' using }.
+              qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
+              qq{ with lines lt $style lw 3 }.
+              qq{ title '  (DEA a=$dea_alphas->[$a])' };
+      }
+    }
+
+    print GP "plot ",join(", ", @plot), "\n";
+    close GP
+        or warn "gnuplot command exited: $?";
+
+    $graph_png_data{$graphname} = readfile("$tmpdir/out.png");
+  }
+}
+
+sub readfile {
+  open (IN, "<$_[0]") or die "cannot read $_[0]";
+  binmode IN;
+  my $str = join('',<IN>);
+  close IN;
+  return $str;
+}
+
+sub filename_to_legend {
+  my $f = shift;
+
+  $f =~ s/^.*\///;
+  $f =~ s/LOGS\.all-//;
+  $f =~ s/\.log\.\S+$//;
+  return $f;
+}

Propchange: spamassassin/branches/3.1/masses/rule-qa/rule-hits-over-time
------------------------------------------------------------------------------
    svn:executable = *