You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2006/12/03 05:13:10 UTC

svn commit: r481710 - in /spamassassin/trunk: lib/Mail/SpamAssassin/ArchiveIterator.pm masses/mass-check sa-learn.raw spamassassin.raw tools/split_corpora

Author: felicity
Date: Sat Dec  2 20:13:09 2006
New Revision: 481710

URL: http://svn.apache.org/viewvc?view=rev&rev=481710
Log:
Move more code from ArchiveIterator to mass-check, since mass-check is the only script we have that uses that code.  Pass a bookkeeping function to the scan_* functions so that the ArchiveIterator (AI) caller can control how the message data is stored, a la bug 5141.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
    spamassassin/trunk/masses/mass-check
    spamassassin/trunk/sa-learn.raw
    spamassassin/trunk/spamassassin.raw
    spamassassin/trunk/tools/split_corpora

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm?view=diff&rev=481710&r1=481709&r2=481710
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm Sat Dec  2 20:13:09 2006
@@ -34,6 +34,7 @@
 use vars qw {
   $MESSAGES
   $AICache
+  %class_opts
   @ISA
 };
 
@@ -93,42 +94,6 @@
 Typically messages over 250k are skipped by ArchiveIterator.  Use this option
 to keep from skipping messages based on size.
 
-=item opt_n
-
-ArchiveIterator is typically used to simulate ham and spam moving through
-SpamAssassin.  By default, the list of messages is sorted by received date so
-that the mails can be passed through in order.  If opt_n is true, the sorting
-will not occur.  This is useful if you don't care about the order of the
-messages.
-
-=item opt_head
-
-Only use the first N ham and N spam (or if the value is -N, only use the first
-N total messages regardless of class).
-
-This setting can be specified separately for ham and spam target classes.
-If multiple targets for one class are specified with different
-options, the last target's options will be used.
-
-If the value is negative, and multiple targets are specified with different
-options, the last spam target's setting will be used.
-
-=item opt_tail
-
-Only use the last N ham and N spam (or if the value is -N, only use the last
-N total messages regardless of class).
-
-If both C<opt_head> and C<opt_tail> are specified, then the C<opt_head> value
-specifies a subset of the C<opt_tail> selection to use; in other words, the
-C<opt_tail> splice is applied first.
-
-This setting can be specified separately for ham and spam target classes.
-If multiple targets for one class are specified with different
-options, the last target's options will be used.
-
-If the value is negative, and multiple targets are specified with different
-options, the last spam target's setting will be used.
-
 =item opt_scanprob
 
 Randomly select messages to scan, with a probability of N, where N ranges
@@ -252,9 +217,9 @@
 following format: C<class:format:raw_location>, or a hash reference containing
 key-value option pairs and a 'target' key with a value in that format.
 
-The key-value option pairs that can be used are: opt_head, opt_tail,
-opt_scanprob, opt_after, opt_before.  See the constructor method's
-documentation for more information on their effects.
+The key-value option pairs that can be used are: opt_scanprob, opt_after,
+opt_before.  See the constructor method's documentation for more information
+on their effects.
 
 run() returns 0 if there was an error (can't open a file, etc,) and 1 if there
 were no errors.
@@ -297,10 +262,22 @@
     return 0;
   }
 
+  # scan the targets and get the number and list of messages
+  $self->scan_targets(\@targets,
+    sub {
+      my($self, $date, $class, $format, $mail) = @_;
+      push(@{$self->{$class}}, index_pack($date, $class, $format, $mail));
+    }
+  );
+
   my $messages;
+  # for ease of memory, we'll play with pointers
+  $messages = $self->{s};
+  undef $self->{s};
+  push(@{$messages}, @{$self->{h}});
+  undef $self->{h};
 
-  # scan the targets and get the number and list of messages
-  ($MESSAGES, $messages) = $self->message_array(\@targets);
+  $MESSAGES = scalar(@{$messages});
 
   # go ahead and run through all of the messages specified
   return $self->_run($messages);
@@ -446,11 +423,10 @@
 
 ############################################################################
 
-# TODO: this needs POD since mass-check uses it?
-sub message_array {
-  my ($self, $targets) = @_;
+sub scan_targets {
+  my ($self, $targets, $bkfunc) = @_;
 
-  my %class_opts = ();
+  %class_opts = ();
 
   foreach my $target (@${targets}) {
     if (!defined $target) {
@@ -504,83 +480,39 @@
 	# detect the format
         if (!-d $location && $location =~ /\.mbox/i) {
           # filename indicates mbox
-          $method = \&scan_mailbox;
+          $format = 'mbox';
         } 
 	elsif ($location eq '-' || !(-d $location)) {
 	  # stdin is considered a file if not passed as mbox
-          $method = \&scan_file;
+          $format = 'file';
 	}
 	else {
 	  # it's a directory
-	  $method = \&scan_directory;
-	}
-      }
-      else {
-	if ($format eq "dir") {
-	  $method = \&scan_directory;
-	}
-	elsif ($format eq "file") {
-	  $method = \&scan_file;
-	}
-	elsif ($format eq "mbox") {
-	  $method = \&scan_mailbox;
-        }
-	elsif ($format eq "mbx") {
-	  $method = \&scan_mbx;
+	  $format = 'dir';
 	}
       }
 
-      if(defined($method)) {
-	&{$method}($self, $class, $location);
+      if ($format eq 'dir') {
+        $method = \&scan_directory;
+      }
+      elsif ($format eq 'mbox') {
+        $method = \&scan_mailbox;
+      }
+      elsif ($format eq 'file') {
+        $method = \&scan_file;
+      }
+      elsif ($format eq 'mbx') {
+        $method = \&scan_mbx;
       }
       else {
 	warn "archive-iterator: format $format unknown!";
+        next;
       }
-    }
-  }
-
-  $self->top_and_tail_messages($self->{h}, $class_opts{h});
-  $self->top_and_tail_messages($self->{s}, $class_opts{s});
 
-  my $messages;
-  if ($self->{opt_n}) {
-    # OPT_N == 1 means don't bother sorting on message receive date
-
-    # for ease of memory, we'll play with pointers
-    $messages = $self->{s};
-    undef $self->{s};
-    push(@{$messages}, @{$self->{h}});
-    undef $self->{h};
-  }
-  else {
-    # OPT_N == 0 means sort on message receive date
-
-    # Sort the spam and ham groups by date
-    my @s = @{$self->{s}};
-    undef $self->{s};
-    my @h = @{$self->{h}};
-    undef $self->{h};
-
-    # interleave ordered spam and ham
-    if (@s && @h) {
-      my $ratio = @s / @h;
-      while (@s && @h) {
-	push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h);
-      }
+      # call the appropriate method
+      &{$method}($self, $class, $location, $bkfunc);
     }
-    # push the rest onto the end
-    push @{$messages}, @s, @h;
   }
-
-  # head or tail < 0 means crop the total list, negate the value appropriately
-  if ($self->{opt_tail} < 0) {
-    splice(@{$messages}, 0, $self->{opt_tail});
-  }
-  if ($self->{opt_head} < 0) {
-    splice(@{$messages}, -$self->{opt_head});
-  }
-
-  return scalar(@{$messages}), $messages;
 }
 
 sub mail_open {
@@ -605,50 +537,11 @@
 
 sub set_default_message_selection_opts {
   my ($self) = @_;
-  $self->{opt_head} = 0 unless (defined $self->{opt_head});
-  $self->{opt_tail} = 0 unless (defined $self->{opt_tail});
   $self->{opt_scanprob} = 1.0 unless (defined $self->{opt_scanprob});
   $self->{opt_want_date} = 1 unless (defined $self->{opt_want_date});
   $self->{opt_cache} = 0 unless (defined $self->{opt_cache});
 }
 
-sub top_and_tail_messages {
-  my ($self, $ary, $opts) = @_;
-
-  foreach my $k (keys %{$opts}) {
-    $self->{$k} = $opts->{$k};
-  }
-  $self->set_default_message_selection_opts();
-
-  if ($self->{opt_n}) {
-    # OPT_N == 1 means don't bother sorting on message receive date
-
-    # head or tail > 0 means crop each list
-    if ($self->{opt_tail} > 0) {
-      splice(@{$ary}, 0, -$self->{opt_tail});
-    }
-    if ($self->{opt_head} > 0) {
-      splice(@{$ary}, min ($self->{opt_head}, scalar @{$ary}));
-    }
-  }
-  else {
-    # OPT_N == 0 means sort on message receive date
-
-    # Sort the spam and ham groups by date
-    my @s = sort { $a cmp $b } @{$ary};
-
-    # head or tail > 0 means crop each list
-    if ($self->{opt_tail} > 0) {
-      splice(@s, 0, -$self->{opt_tail});
-    }
-    if ($self->{opt_head} > 0) {
-      splice(@s, min ($self->{opt_head}, scalar @s));
-    }
-
-    @{$ary} = @s;
-  }
-}
-
 ############################################################################
 
 sub message_is_useful_by_date {
@@ -717,7 +610,7 @@
 ############################################################################
 
 sub scan_directory {
-  my ($self, $class, $folder) = @_;
+  my ($self, $class, $folder, $bkfunc) = @_;
 
   my @files;
 
@@ -744,7 +637,7 @@
   $self->create_cache('dir', $folder);
 
   foreach my $mail (@files) {
-    $self->scan_file($class, $mail);
+    $self->scan_file($class, $mail, $bkfunc);
   }
 
   if (defined $AICache) {
@@ -753,43 +646,44 @@
 }
 
 sub scan_file {
-  my ($self, $class, $mail) = @_;
+  my ($self, $class, $mail, $bkfunc) = @_;
 
   $self->bump_scan_progress();
 
   my @s = stat($mail);
   return unless $self->message_is_useful_by_file_modtime($s[9]);
 
-  if (!$self->{determine_receive_date}) {
-    push(@{$self->{$class}}, index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
-    return;
-  }
+  my $date = AI_TIME_UNKNOWN;
 
-  my $date;
-  unless (defined $AICache and $date = $AICache->check($mail)) {
-    my $header;
-    if (!mail_open($mail)) {
-      $self->{access_problem} = 1;
-      return;
-    }
-    while (<INPUT>) {
-      last if /^\s*$/;
-      $header .= $_;
-    }
-    close(INPUT);
-    $date = Mail::SpamAssassin::Util::receive_date($header);
-    if (defined $AICache) {
-      $AICache->update($mail, $date);
+  if ($self->{determine_receive_date}) {
+    unless (defined $AICache and $date = $AICache->check($mail)) {
+      my $header;
+      if (!mail_open($mail)) {
+        $self->{access_problem} = 1;
+        return;
+      }
+      while (<INPUT>) {
+        last if /^\s*$/;
+        $header .= $_;
+      }
+      close(INPUT);
+      $date = Mail::SpamAssassin::Util::receive_date($header);
+      if (defined $AICache) {
+        $AICache->update($mail, $date);
+      }
     }
+
+    return if !$self->message_is_useful_by_date($date);
+    return if !$self->scanprob_says_scan();
   }
 
-  return if !$self->message_is_useful_by_date($date);
-  return if !$self->scanprob_says_scan();
-  push(@{$self->{$class}}, index_pack($date, $class, "f", $mail));
+  &{$bkfunc}($self, $date, $class, 'f', $mail);
+
+  return;
 }
 
 sub scan_mailbox {
-  my ($self, $class, $folder) = @_;
+  my ($self, $class, $folder, $bkfunc) = @_;
   my @files;
 
   if ($folder ne '-' && -d $folder) {
@@ -884,7 +778,7 @@
       }
       next if !$self->scanprob_says_scan();
 
-      push(@{$self->{$class}}, index_pack($v, $class, "m", "$file.$k"));
+      &{$bkfunc}($self, $v, $class, 'm', "$file.$k");
     }
 
     if (defined $AICache) {
@@ -894,7 +788,7 @@
 }
 
 sub scan_mbx {
-  my ($self, $class, $folder) = @_;
+  my ($self, $class, $folder, $bkfunc) = @_;
   my (@files, $fp);
 
   if ($folder ne '-' && -d $folder) {
@@ -992,7 +886,7 @@
       }
       next if !$self->scanprob_says_scan();
 
-      push(@{$self->{$class}}, index_pack($v, $class, "b", "$file.$k"));
+      &{$bkfunc}($self, $v, $class, 'b', "$file.$k");
     }
 
     if (defined $AICache) {
@@ -1044,10 +938,6 @@
     # return csh-style globs: ./corpus/*.mbox => er, you know what it does ;)
     return glob($path);
   }
-}
-
-sub min {
-  return ($_[0] < $_[1] ? $_[0] : $_[1]);
 }
 
 sub create_cache {

Modified: spamassassin/trunk/masses/mass-check
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/mass-check?view=diff&rev=481710&r1=481709&r2=481710
==============================================================================
--- spamassassin/trunk/masses/mass-check (original)
+++ spamassassin/trunk/masses/mass-check Sat Dec  2 20:13:09 2006
@@ -149,6 +149,8 @@
 $opt_c = "$FindBin::Bin/../rules";
 $opt_p = "$FindBin::Bin/spamassassin";
 $opt_j = 1;
+$opt_head = 0;
+$opt_tail = 0;
 $opt_net = 0;
 $opt_hamlog = "ham.log";
 $opt_spamlog = "spam.log";
@@ -321,9 +323,6 @@
   }
 
   # ArchiveIterator options for non-client mode
-  $AIopts->{'opt_n'} = $opt_n;
-  $AIopts->{'opt_head'} = $opt_head;
-  $AIopts->{'opt_tail'} = $opt_tail;
   $AIopts->{'opt_scanprob'} = $opt_scanprob;
   $AIopts->{'opt_cache'} = $opt_cache;
   $AIopts->{'opt_cachedir'} = $opt_cachedir;
@@ -338,7 +337,7 @@
 }
 else {
   # ArchiveIterator options for client mode -- tends to be simple
-  $AIopts->{'opt_n'} = 1;
+  $opt_n = 1;
 }
 
 ###########################################################################
@@ -377,12 +376,7 @@
     }
     elsif (defined $tmpf) {
       # child -- process using message_array
-      my($num, $messages) = $iter->message_array(\@targets);
-
-      # Dump out the number of messages and the message index info to
-      # the temp file
-      send_line($tmpfd, $num, @{$messages});
-
+      generate_queue(\@targets, $tmpfd);
       exit;
     }
     else {
@@ -391,11 +385,7 @@
   }
   else {
     # we get here if opt_j == 0, so scan in this process
-    my($num, $messages) = $iter->message_array(\@targets);
-
-    # Dump out the number of messages and the message index info to
-    # the temp file
-    send_line($tmpfd, $num, @{$messages});
+    generate_queue(\@targets, $tmpfd);
   }
 
   # we now have a temporary file with the messages to process
@@ -450,8 +440,6 @@
   # message-selection options; these can now be specified separately
   # for each target
   my %selopts = (
-    opt_head => $opt_head,
-    opt_tail => $opt_tail,
     opt_scanprob => $opt_scanprob,
     opt_after => $opt_after,
     opt_before => $opt_before
@@ -1868,4 +1856,100 @@
   if ($opt_before && $opt_after && $opt_after >= $opt_before) {
     die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
   }
+}
+
+sub generate_queue {
+  my ($targets, $tmpfd) = @_;
+
+  # scan the targets and get the number and list of messages
+  $iter->scan_targets($targets,
+    sub {
+      my($self, $date, $class, $format, $mail) = @_;
+      push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::index_pack($date, $class, $format, $mail));
+    }
+  );
+
+  # deal with opt_head and opt_tail
+  top_and_tail_messages($iter->{h});
+  top_and_tail_messages($iter->{s});
+
+  my $messages;
+  if ($opt_n) {
+    # OPT_N == 1 means don't bother sorting on message receive date
+
+    # for ease of memory, we'll play with pointers
+    $messages = $iter->{s};
+    undef $iter->{s};
+    push(@{$messages}, @{$iter->{h}});
+    undef $iter->{h};
+  }
+  else {
+    # OPT_N == 0 means sort on message receive date
+
+    # Sort the spam and ham groups by date
+    my @s = @{$iter->{s}};
+    undef $iter->{s};
+    my @h = @{$iter->{h}};
+    undef $iter->{h};
+
+    # interleave ordered spam and ham
+    if (@s && @h) {
+      my $ratio = @s / @h;
+      while (@s && @h) {
+	push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h);
+      }
+    }
+    # push the rest onto the end
+    push @{$messages}, @s, @h;
+  }
+
+  # head or tail < 0 means crop the total list, negate the value appropriately
+  if ($opt_tail < 0) {
+    splice(@{$messages}, 0, $opt_tail);
+  }
+  if ($opt_head < 0) {
+    splice(@{$messages}, -$opt_head);
+  }
+
+  my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages});
+
+  # Dump out the number of messages and the message index info to
+  # the temp file
+  send_line($tmpfd, $num, @{$messages});
+}
+
+sub top_and_tail_messages {
+  my ($ary) = @_;
+
+  if ($opt_n) {
+    # OPT_N == 1 means don't bother sorting on message receive date
+
+    # head or tail > 0 means crop each list
+    if ($opt_tail > 0) {
+      splice(@{$ary}, 0, -$opt_tail);
+    }
+    if ($opt_head > 0) {
+      splice(@{$ary}, min ($opt_head, scalar @{$ary}));
+    }
+  }
+  else {
+    # OPT_N == 0 means sort on message receive date
+
+    # Sort the spam and ham groups by date
+    my @s = sort { $a cmp $b } @{$ary};
+
+    # head or tail > 0 means crop each list
+    if ($opt_tail > 0) {
+      splice(@s, 0, -$opt_tail);
+    }
+    if ($opt_head > 0) {
+      splice(@s, min ($opt_head, scalar @s));
+    }
+
+    @{$ary} = @s;
+  }
+}
+
+sub min {
+  return ($_[0] < $_[1] ? $_[0] : $_[1]);
 }

Modified: spamassassin/trunk/sa-learn.raw
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sa-learn.raw?view=diff&rev=481710&r1=481709&r2=481710
==============================================================================
--- spamassassin/trunk/sa-learn.raw (original)
+++ spamassassin/trunk/sa-learn.raw Sat Dec  2 20:13:09 2006
@@ -406,7 +406,6 @@
 
   my $iter = new Mail::SpamAssassin::ArchiveIterator(
     {
-      'opt_n'   => 1,
       'opt_all' => 0,       # skip messages over 250k
     }
   );

Modified: spamassassin/trunk/spamassassin.raw
URL: http://svn.apache.org/viewvc/spamassassin/trunk/spamassassin.raw?view=diff&rev=481710&r1=481709&r2=481710
==============================================================================
--- spamassassin/trunk/spamassassin.raw (original)
+++ spamassassin/trunk/spamassassin.raw Sat Dec  2 20:13:09 2006
@@ -312,7 +312,6 @@
 # Everything below here needs ArchiveIterator ...
 my $iter = new Mail::SpamAssassin::ArchiveIterator(
   {
-    'opt_n'   => 1,
     'opt_all' => 1,
     'opt_want_date' => 0
   }

Modified: spamassassin/trunk/tools/split_corpora
URL: http://svn.apache.org/viewvc/spamassassin/trunk/tools/split_corpora?view=diff&rev=481710&r1=481709&r2=481710
==============================================================================
--- spamassassin/trunk/tools/split_corpora (original)
+++ spamassassin/trunk/tools/split_corpora Sat Dec  2 20:13:09 2006
@@ -55,7 +55,6 @@
 my $current_bucket = 0;
 
 my $iter = new Mail::SpamAssassin::ArchiveIterator({
-        'opt_n' => 1,
         'opt_all' => 1,
   });