You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/10/27 08:38:48 UTC
svn commit: r328806 - /spamassassin/trunk/masses/rule-hits-over-time

Author: jm
Date: Wed Oct 26 23:38:45 2005
New Revision: 328806

URL: http://svn.apache.org/viewcvs?rev=328806&view=rev
Log:
major improvements to the rule-hits graphing script

Modified:
    spamassassin/trunk/masses/rule-hits-over-time

Modified: spamassassin/trunk/masses/rule-hits-over-time
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/rule-hits-over-time?rev=328806&r1=328805&r2=328806&view=diff
==============================================================================
--- spamassassin/trunk/masses/rule-hits-over-time (original)
+++ spamassassin/trunk/masses/rule-hits-over-time Wed Oct 26 23:38:45 2005
@@ -1,43 +1,329 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
+#
+# rule-hits-over-time - produce graphs of rule hits over time, using GD::Graph
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
 
-my $lastbucket = 0;
-my $nextbucket = 0;
-my $PERIOD = (24 * 60 * 60 * 7);
-
-while (<>) {
-  my $found = 0;
-  /SARE_SUBJ/ and $found = 1;
-  s/^.*\btime=//; s/,.*$//;
-  
-  my $t = $_;
-  if ($lastbucket == 0) {
-    $lastbucket = $t;
-    $nextbucket = $t + $PERIOD;    # plus 2 hrs
-  }
-  if ($t < $nextbucket) {
-    if ($found) {
-      $seen_y++;
+use Getopt::Long;
+use SDBM_File;
+use GD;
+use GD::Graph;
+
+use strict;
+use warnings;
+use Fcntl;
+
+# usage(): print the command-line synopsis on STDERR and abort the script.
+# The text ends in a newline, so die() does not append a file/line suffix.
+# Never returns.
+sub usage {
+  my $synopsis = q{
+usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...]
+
+  --rule=rulename       specify rule to map
+  --period=secs         specify period (default: 1 day)
+  --size_x=pixels       width of output graphs, in pixels (def: 800)
+  --size_y=pixels       height of ONE of the output graphs, in pixels
+                        (default: 400)
+  --as_counts           Do not scale to a percentage of messages;
+                        report absolute messages hit per time period
+  --cgi                 CGI output, to stdout with HTTP headers
+  --text                text output only
+};
+  die $synopsis;
+}
+
+# Option variables.  GetOptions() below is called without explicit
+# destinations, so Getopt::Long stores each option in the package
+# variable named $opt_<option>.
+use vars qw(
+        $opt_rule $opt_size_x $opt_size_y $opt_text $opt_cgi
+        $opt_period $opt_as_counts
+);
+
+GetOptions(
+        'rule=s',
+        'size_x=i',
+        'size_y=i',
+        'text',
+        'cgi',
+        'as_counts',
+        'period=i',
+) or usage();
+
+# --rule is mandatory
+usage() unless $opt_rule;
+
+# A hit is the rule name delimited by a comma or a space on both sides.
+# NOTE(review): $opt_rule is interpolated as a pattern, not quoted --
+# confirm whether regex-style rule names are meant to be accepted.
+my $rule_re = qr/[, ]${opt_rule}[, ]/;
+
+# width of one time bucket, in seconds (default: one day)
+my $period = $opt_period || (24 * 60 * 60 * 1);
+
+my $graph_x                         = $opt_size_x || 800;
+my $graph_y                         = $opt_size_y || 400;
+my $scale_to_total_volume           = ($opt_as_counts ? 0 : 1);
+my $graph_files_individually        = 0;    # or as ham & spam sets
+# my $y_ceiling                     = 3000; # mails per $period
+
+# State shared between the main loop and the subs below:
+#   $fname_counter  sequence number used by plot_gd() for "fileNN.<fmt>"
+#   %allbuckets     keys are the bucket start times seen (values unused)
+#   %allresults     per-file hash, keyed "y<bucket>" / "n<bucket>"
+#   @allfiles       files read so far in the current set
+my $fname_counter = 1;
+my %allbuckets = ();
+my %allresults = ();
+my @allfiles = ();
+
+# graph object built by create_gd(), and the data set handed to its plot()
+my $gd;
+my $graph_data;
+# the %allresults entry of the file currently being read
+my $this_file_results;
+
+# read_logs()/completeline() working state: start of the current and of
+# the following bucket, and the hit / non-hit counts of the current bucket
+my $lastbucket;
+my $nextbucket;
+my $seen_y;
+my $seen_n;
+
+# In CGI mode all intermediate files are written inside a scratch directory
+# named after our process id; it is removed again at the end of the script.
+my $tmpdir = "/tmp/rulehits.$$";
+if ($opt_cgi) {
+  # the name is predictable, so keep the directory private to this user;
+  # mkdir fails (and we bail out) if something already exists at that path
+  mkdir ($tmpdir, 0700) or die "collided on $tmpdir: $!";
+  # an unchecked chdir failure would scatter the temp files over whatever
+  # directory we happened to be started in
+  chdir ($tmpdir) or die "cannot chdir to $tmpdir: $!";
+}
+
+# $file_sets is a list of sets; each set is a list of log filenames that
+# are handled together and end up on one graph.  In the default (ham/spam)
+# mode each set starts with a "TITLE:..." pseudo-entry, which the main loop
+# strips off and appends to the graph title.
+my $file_sets = [ ];    # split into ham and spam
+
+if (!$graph_files_individually) {
+  $file_sets = [ [ 'TITLE:hits in ham' ], [ 'TITLE:hits in spam' ] ];
+}
+
+# distribute the log files named on the command line over the sets
+foreach my $file (@ARGV) {
+  if ($graph_files_individually) {
+    push @{$file_sets}, [ $file ];
+  }
+  else {
+    # NOTE(review): any path containing "ham" anywhere (directory names
+    # included) goes to the ham set; everything else is treated as spam
+    if ($file =~ /ham/) {
+      push @{$file_sets->[0]}, $file;
     } else {
-      $seen_n++;
+      push @{$file_sets->[1]}, $file;
     }
   }
-  else {
-    while ($t >= $nextbucket) {
-      completeline();
-      $lastbucket = $nextbucket;
-      $nextbucket += $PERIOD;
+}
+
+# Main loop: every set of log files yields one graph (or, with --text, one
+# table).  For each set the per-file counts are collected with read_logs(),
+# then summarised and plotted once.
+foreach my $set (@{$file_sets}) {
+  # start every set with clean per-set state
+  @allfiles = ();
+  %allbuckets = ();
+  %allresults = ();
+
+  # a leading "TITLE:..." pseudo-entry names the set; strip it off so that
+  # only real filenames remain
+  my $settitle = '';
+  if ($set->[0] =~ /^TITLE:(.*)$/) {
+    $settitle = $1; shift(@{$set});
+  }
+  create_gd("$opt_rule $settitle");
+
+  foreach my $file (@{$set}) {
+    push (@allfiles, $file);
+
+    if (1) {
+      # use an on-disk file to avoid massive VM usage for this hash
+      # on huge datasets
+      unlink("graph.tmp.dir");
+      unlink("graph.tmp.pag");
+      tie (%{$allresults{$file}}, 'SDBM_File', 'graph.tmp', O_RDWR|O_CREAT, 0600)
+            or die "tie failed: $!";
     }
+    else {
+      %{$allresults{$file}} = ();
+    }
+
+    # completeline() stores its counts through this reference
+    $this_file_results = $allresults{$file};
+    read_logs($file);
   }
+
+  # Summarise once, after all files of the set have been read.  Doing this
+  # inside the file loop rebuilt the data set for every file and, in --text
+  # mode, printed a partial table after each file.
+  $graph_data = GD::Graph::Data->new();
+  summarise();
+
+  plot_gd();
 }
 
-sub completeline {
-  print "$lastbucket $seen_y $seen_n\n";
+# plot_gd() writes its images as "fileNN.<format>", with <format> taken
+# from GD::Graph's export_format(); derive the names here the same way
+# instead of hard-coding ".gif", which breaks on GD builds that emit PNG.
+my $img_format = ($gd && $gd->export_format) || 'gif';
+my $combined = "both.$img_format";
+
+if (!$graph_files_individually) {
+  # join the first (ham) and second (spam) graph into one image with
+  # ImageMagick; list-form system() avoids going through a shell
+  system ("convert", "-append",
+          "file01.$img_format", "file02.$img_format", $combined) == 0
+        or warn "convert failed (status $?)\n";
+}
+
+if ($opt_cgi) {
+  use CGI qw(:standard);
+  print header("image/$img_format");
+  binmode STDOUT;
+  open (my $img, '<', $combined) or die "no $combined: $!";
+  binmode $img;
+  while (<$img>) { print STDOUT; }
+  close $img;
+
+  # clean up tmp files; step out of the scratch directory first so that we
+  # are not removing our own working directory
+  chdir ("/");
+  system ("rm", "-rf", $tmpdir);
+}
+exit;
+
+# summarise()
+#
+# Walk all buckets recorded in %allbuckets in chronological order and emit
+# one row per bucket: printed to STDOUT with --text, otherwise added as a
+# point to $graph_data.  Each row starts with the bucket start time,
+# followed by
+#   - scaled mode (default): one column per file in @allfiles, holding the
+#     hits as a percentage of that file's messages in the bucket;
+#   - --as_counts mode: the summed non-hit count of all files, then one
+#     column per file holding the absolute hit count.
+# Takes no arguments; reads %allbuckets, %allresults and @allfiles.  The
+# return value is not meaningful.
+sub summarise {
+  # bucket keys are epoch seconds: compare them numerically, the default
+  # string sort misorders timestamps with differing digit counts
+  foreach my $bucket (sort { $a <=> $b } keys %allbuckets) {
+    my $total_n = 0;
+    my @cols = ();
+    foreach my $file (@allfiles) {
+      # a file without lines in this bucket simply contributes zeroes
+      my $hits   = $allresults{$file}->{"y".$bucket} || 0;
+      my $misses = $allresults{$file}->{"n".$bucket} || 0;
+      if ($scale_to_total_volume) {
+        # the tiny fallback divisor avoids a division by zero for an
+        # empty bucket
+        my $frac = $hits / (($hits + $misses) || 0.0001);
+        push @cols, ($frac * 100.0);
+      }
+      else {
+        $total_n += $misses;
+        # if ($y_ceiling && $hits > $y_ceiling) { $hits = $y_ceiling; }
+        push (@cols, $hits);
+      }
+    }
+
+    if ($scale_to_total_volume) {
+      @cols = ($bucket, @cols);     # no total column: it is always 100%
+    } else {
+      # if ($y_ceiling && $total_n > $y_ceiling) { $total_n = $y_ceiling; }
+      @cols = ($bucket, $total_n, @cols);
+    }
+
+    if ($opt_text) {
+      print join(' ',@cols)."\n";
+    }
+    else {
+      $graph_data->add_point(@cols);
+    }
+  }
+}
+
+
+# read_logs($file)
+#
+# Scan one log file and count, per time bucket of $period seconds, how many
+# lines match $rule_re ($seen_y) and how many do not ($seen_n).  Lines that
+# start with "#" or carry no "time=<digits>," field are ignored.  Each
+# finished bucket is stored via completeline(); buckets that are skipped
+# over are stored with zero counts.
+#
+# The first bucket starts at the first timestamp rounded down to a multiple
+# of $period.  NOTE(review): lines are assumed to arrive in ascending time
+# order; an older timestamp is counted in whatever bucket is current.
+#
+# Dies if $file cannot be opened.  The return value is not meaningful.
+sub read_logs {
+  my $file = shift;
+
+  $lastbucket = undef;
+  $nextbucket = undef;
   $seen_y = 0;
   $seen_n = 0;
+
+  open (my $in, '<', $file) or die "cannot read $file: $!";
+  while (<$in>) {
+    next if /^#/;
+
+    my $t;
+    /\btime=(\d+),/ and $t = $1;
+    next unless $t;
+
+    my $found = ($_ =~ $rule_re);
+
+    if (!defined $lastbucket) {
+      $lastbucket = $t - ($t % $period);
+      $nextbucket = $lastbucket + $period;
+    }
+
+    # Flush every bucket that ended before this line, THEN count the line.
+    # Counting only when no roll-over happened dropped the first line of
+    # every new bucket.
+    while ($t >= $nextbucket) {
+      completeline();
+      $lastbucket = $nextbucket;
+      $nextbucket += $period;
+    }
+
+    if ($found) {
+      $seen_y++;
+    } else {
+      $seen_n++;
+    }
+  }
+  close $in;
+
+  # flush the last, still open bucket -- unless the file held no
+  # timestamped lines at all, in which case there is no bucket to store
+  completeline() if defined $lastbucket;
 }
 
-print STDERR '
+# completeline(): close the bucket that starts at $lastbucket.  Registers
+# the bucket in %allbuckets, files the running hit / non-hit counters under
+# "y<bucket>" and "n<bucket>" in the current file's results, and resets
+# both counters for the next bucket.
+sub completeline {
+  my $bucket = $lastbucket;
+
+  $allbuckets{$bucket} = undef;         # only the key matters
+  $this_file_results->{"y$bucket"} = $seen_y;
+  $this_file_results->{"n$bucket"} = $seen_n;
+
+  $seen_y = 0;
+  $seen_n = 0;
+}
 
-plot "times" using 0:1, "times" using 0:2
 
-';
+# create_gd($title)
+#
+# Create a new GD::Graph::lines object of $graph_x by $graph_y pixels,
+# titled $title, configure its colours, axes and labels, and leave it in
+# the file-level $gd.  When values are scaled to percentages the y axis is
+# pinned to the range 0..100.
+sub create_gd {
+  my ($heading) = @_;
+
+  use GD::Graph::lines;
+
+  # line colours, handed out to the data sets in this order
+  my @palette = (
+    "#33cc00",  # green
+    "#ff3300",  # red
+    "#0000cc",  # blue
+    "#99cc00",  # yellow-green
+    "#ff9900",  # orange
+    "#cccc00",  # yellowish
+    "#333333",  # dark grey
+    "#999999",  # light grey
+  );
+
+  my $y_axis_label = "Hits in period";
+  if ($scale_to_total_volume) {
+    $y_axis_label = "\%age of mail in period";
+  }
+
+  my %settings = (
+      title             => $heading,
+      box_axis          => 1,
+      # show_values     => 1,
+
+      bgclr             => "#ffffff",
+      fgclr             => "#000000",
+      boxclr            => "#fdfdfd",
+      labelclr          => "#000000",
+
+      dclrs             => \@palette,
+      r_margin          => 20,
+
+      y_label           => $y_axis_label,
+      zero_axis         => 1,
+
+      # x_label         => "Time (in blocks of $period secs)",
+      x_labels_vertical => 0,
+      x_tick_number     => 'auto',
+      x_number_format   => \&fmt_time_t,    # x values are bucket times
+  );
+
+  $gd = GD::Graph::lines->new($graph_x, $graph_y);
+  $gd->set (%settings);
+
+  if ($scale_to_total_volume) {
+    $gd->set (y_min_value => 0, y_max_value => 100);
+  }
+}
+
+# fmt_time_t($epoch): render a time in epoch seconds as a short UTC date
+# of the form "Oct 27 2005" (the day of month is space-padded).
+sub fmt_time_t {
+  my ($epoch) = @_;
+
+  use POSIX qw(strftime);
+  my @utc = gmtime($epoch);
+  return strftime("%b %e %Y", @utc);
+}
+
+# plot_gd()
+#
+# Emit the result for the current set.  With --text only a gnuplot hint is
+# printed to STDERR (the table itself was already printed by summarise()).
+# Otherwise $graph_data is plotted with $gd and the image is written to
+# "fileNN.<format>" in the current directory, where NN comes from
+# $fname_counter (incremented here) and <format> from the graph object's
+# export_format().  Dies if the image file cannot be written.
+sub plot_gd {
+  if ($opt_text) {
+    print STDERR '
+
+    plot "times" using 0:1, "times" using 0:2
+
+    ';
+    return;
+  }
+
+  # plot() returns false on failure; report it instead of silently writing
+  # whatever the image object happens to contain
+  $gd->plot($graph_data)
+        or warn "plot failed: " . ($gd->error || 'unknown error') . "\n";
+
+  my $format = $gd->export_format;
+  my $fname = sprintf("file%02d.%s", $fname_counter++, $format);
+  open (my $img, '>', $fname) or die "cannot write $fname: $!";
+  binmode $img;
+  print {$img} $gd->gd()->$format();
+  # buffered write errors only surface at close
+  close $img or die "cannot close $fname: $!";
+}