You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/10/30 05:14:39 UTC

svn commit: rev 56013 - spamassassin/trunk/lib/Mail/SpamAssassin

Author: felicity
Date: Fri Oct 29 20:14:39 2004
New Revision: 56013

Modified:
   spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
Log:
A bunch of changes to ArchiveIterator.

First, functions were moved around to be grouped into the two operational
sections that AI has: scanning (finds messages) and running (processes
messages).

Second, added some more comments to document what is going on.

Third, the old code for --head and --tail looked like it didn't work as
documented, specifically the -N values.

Fourth, and this is the big one, a massive change to the internals of AI.
Instead of storing the messages in a large hash, storing the receive date
multiple times per record, doing complicated sorts of data, etc.

In the end, my testing found the scan time and memory usage both went
down pretty significantly (20-50%).  For a generic nightly run (~100k
messages), opt_n==0 went from 40s/48M to 32s/26M.  opt_n==1 from 10s/54M
to 5s/30M.  Most dramatically, for a full corpus run (~2.4m messages),
opt_n==1, 911s/964M to 440s/447M.

Testing also shows that the new version returns slightly differently
ordered (expected) but the same total data.



Modified: spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm	Fri Oct 29 20:14:39 2004
@@ -176,8 +176,11 @@
   if (!defined $self) { $self = { }; }
   bless ($self, $class);
 
-  $self->{s} = { };		# spam, of course
-  $self->{h} = { };		# ham, as if you couldn't guess
+  $self->{opt_head} = 0 unless exists $self->{opt_head};
+  $self->{opt_tail} = 0 unless exists $self->{opt_tail};
+
+  $self->{s} = [ ];		# spam, of course
+  $self->{h} = [ ];		# ham, as if you couldn't guess
 
   $self;
 }
@@ -232,6 +235,8 @@
 front of the value will be replaced by the C<HOME> environment variable.
 Escaped whitespace is protected as well.
 
+B<NOTE:> C<~user> is not allowed.
+
 =back
 
 =cut
@@ -312,7 +317,7 @@
 	  my $line;
 	  while ($line = readline $socket) {
 	    if ($line =~ /^RESULT (.+)$/) {
-	      my ($class,$type,$date) = index_unpack($1);
+	      my ($date,$class,$type) = run_index_unpack($1);
 	      #warn ">> RESULT: $class, $type, $date\n";
 
 	      if (defined $self->{opt_restart} &&
@@ -397,100 +402,127 @@
 
 ############################################################################
 
-sub message_array {
-  my ($self, $targets, $fh) = @_;
-
-  foreach my $target (@${targets}) {
-    my ($class, $format, $rawloc) = split(/:/, $target, 3);
+## run_message and related functions to process a single message
 
-    # use ham by default, things like "spamassassin" can't specify the type
-    $class = substr($class, 0, 1) || 'h';
+sub run_message {
+  my ($self, $msg) = @_;
 
-    my @locations = $self->fix_globs($rawloc);
+  my ($date, $class, $format, $mail) = run_index_unpack($msg);
 
-    foreach my $location (@locations) {
-      my $method;
+  if ($format eq "f") {
+    return $self->run_file($class, $format, $mail, $date);
+  }
+  elsif ($format eq "m") {
+    return $self->run_mailbox($class, $format, $mail, $date);
+  }
+  elsif ($format eq "b") {
+    return $self->run_mbx($class, $format, $mail, $date);
+  }
+}
 
-      if ($format eq 'detect') {
-	# detect the format
-	if ($location eq '-' || !(-d $location)) {
-	  # stdin is considered a file if not passed as mbox
-	  $method = \&scan_file;
-	}
-	else {
-	  # it's a directory
-	  $method = \&scan_directory;
-	}
-      }
-      else {
-	if ($format eq "dir") {
-	  $method = \&scan_directory;
-	}
-	elsif ($format eq "file") {
-	  $method = \&scan_file;
-	}
-	elsif ($format eq "mbox") {
-	  $method = \&scan_mailbox;
-        }
-	elsif ($format eq "mbx") {
-	  $method = \&scan_mbx;
-	}
-      }
+sub run_file {
+  my ($self, $class, $format, $where, $date) = @_;
 
-      if(defined($method)) {
-	&{$method}($self, $class, $location);
-      }
-      else {
-	warn "archive-iterator: format $format unknown!";
-      }
-    }
+  mail_open($where) or return;
+  # skip too-big mails
+  if (! $self->{opt_all} && -s INPUT > BIG_BYTES) {
+    close INPUT;
+    return;
   }
+  my @msg;
+  my $header = '';
+  while(<INPUT>) {
+    if (!$header && /^$/) {
+      $header = join('', @msg);
+    }
 
-  my @messages;
-  if ($self->{opt_n}) {
-    # do two small sorts, one for each class of message
-    # the sort is only useful for mbox/mbx files so that we go through the
-    # files completely (in order?) before moving onto another one
-    @messages = ( sort keys(%{$self->{s}}), sort keys(%{$self->{h}}) );
-    undef $self->{s};
-    undef $self->{h};
+    push(@msg, $_);
+  }
+  close INPUT;
 
-    # head and tail are really non-deterministic with opt_n
-    splice(@messages, $self->{opt_head}) if $self->{opt_head};
-    splice(@messages, 0, -$self->{opt_tail}) if $self->{opt_tail};
+  if ($date == AI_TIME_UNKNOWN) {
+    $date = Mail::SpamAssassin::Util::receive_date($header);
   }
-  else {
-    my @s = sort({ $self->{s}->{$a} <=> $self->{s}->{$b} } keys %{$self->{s}});
-    undef $self->{s};
-    my @h = sort({ $self->{h}->{$a} <=> $self->{h}->{$b} } keys %{$self->{h}});
-    undef $self->{h};
 
-    if ($self->{opt_head}) {
-      splice(@s, $self->{opt_head});
-      splice(@h, $self->{opt_head});
+  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
+}
+
+sub run_mailbox {
+  my ($self, $class, $format, $where, $date) = @_;
+
+  my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
+  my @msg;
+  my $header = '';
+  mail_open($file) or return;
+  seek(INPUT,$offset,0);
+  my $past = 0;
+  while (<INPUT>) {
+    if ($past) {
+      last if substr($_,0,5) eq "From ";
     }
-    if ($self->{opt_tail}) {
-      splice(@s, 0, -$self->{opt_tail});
-      splice(@h, 0, -$self->{opt_tail});
+    else {
+      $past = 1;
     }
 
-    # interleave ordered spam and ham
-    while (@s && @h) {
-      push @messages, (shift @s), (shift @h);
+    # skip too-big mails
+    if (! $self->{opt_all} && @msg > BIG_LINES) {
+      close INPUT;
+      return;
     }
 
-    # push the rest onto the end
-    push @messages, @s, @h;
+    if (!$header && /^$/) {
+      $header = join('', @msg);
+    }
+
+    push (@msg, $_);
   }
+  close INPUT;
 
-  if (defined $fh) {
-    print { $fh } map { "$_\n" } scalar(@messages), @messages;
-    return;
+  if ($date == AI_TIME_UNKNOWN) {
+    $date = Mail::SpamAssassin::Util::receive_date($header);
   }
 
-  return scalar(@messages), \@messages;
+  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
 }
 
+sub run_mbx {
+  my ($self, $class, $format, $where, $date) = @_;
+
+  my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
+  my @msg;
+  my $header = '';
+
+  mail_open($file) or return;
+  seek(INPUT, $offset, 0);
+    
+  while (<INPUT>) {
+    last if ($_ =~ MBX_SEPARATOR);
+	
+    # skip mails that are too big
+    if (! $self->{opt_all} && @msg > BIG_LINES) {
+      close INPUT;
+      return;
+    }
+
+    if (!$header && /^$/) {
+      $header = join('', @msg);
+    }
+
+    push (@msg, $_);
+  }
+  close INPUT;
+
+  if ($date == AI_TIME_UNKNOWN) {
+    $date = Mail::SpamAssassin::Util::receive_date($header);
+  }
+
+  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
+}
+
+############################################################################
+
+## figure out the next message to process, used when opt_j >= 1
+
 sub next_message {
   my ($self) = @_;
   my $line = readline $self->{messageh};
@@ -498,6 +530,10 @@
   return $line;
 }
 
+############################################################################
+
+## children processors, start and process, used when opt_j > 1
+
 sub start_children {
   my ($self, $count, $child, $pid, $socket) = @_;
 
@@ -545,7 +581,7 @@
 	# but run_message would have calculated it, so reset the packed
 	# version if possible ...
         if ($self->{opt_n} && $class && $format && defined $date && $where) {
-	  $line = index_pack($class, $format, $date, $where);
+	  $line = run_index_pack($date, $class, $format, $where);
         }
 
 	print "$result\nRESULT $line\n";
@@ -558,6 +594,8 @@
   }
 }
 
+## handling killing off the children
+
 sub reap_children {
   my ($self, $count, $socket, $pid) = @_;
 
@@ -576,6 +614,142 @@
   }
 }
 
+############################################################################
+
+# 0 850852128			atime
+# 1 h				class
+# 2 m				format
+# 3 ./ham/goodmsgs.0		path
+
+sub run_index_pack {
+  return join("\000", @_);
+}
+
+sub run_index_unpack {
+  return split(/\000/, $_[0]);
+}
+
+############################################################################
+
+## FUNCTIONS BELOW THIS POINT ARE FOR FINDING THE MESSAGES TO RUN AGAINST
+
+############################################################################
+
+sub message_array {
+  my ($self, $targets, $fh) = @_;
+
+  foreach my $target (@${targets}) {
+    my ($class, $format, $rawloc) = split(/:/, $target, 3);
+
+    # use ham by default, things like "spamassassin" can't specify the type
+    $class = substr($class, 0, 1) || 'h';
+
+    my @locations = $self->fix_globs($rawloc);
+
+    foreach my $location (@locations) {
+      my $method;
+
+      if ($format eq 'detect') {
+	# detect the format
+	if ($location eq '-' || !(-d $location)) {
+	  # stdin is considered a file if not passed as mbox
+	  $method = \&scan_file;
+	}
+	else {
+	  # it's a directory
+	  $method = \&scan_directory;
+	}
+      }
+      else {
+	if ($format eq "dir") {
+	  $method = \&scan_directory;
+	}
+	elsif ($format eq "file") {
+	  $method = \&scan_file;
+	}
+	elsif ($format eq "mbox") {
+	  $method = \&scan_mailbox;
+        }
+	elsif ($format eq "mbx") {
+	  $method = \&scan_mbx;
+	}
+      }
+
+      if(defined($method)) {
+	&{$method}($self, $class, $location);
+      }
+      else {
+	warn "archive-iterator: format $format unknown!";
+      }
+    }
+  }
+
+  my @messages;
+  if ($self->{opt_n}) {
+    # head or tail > 0 means crop each list
+    if ($self->{opt_head} > 0) {
+      splice(@{$self->{s}}, $self->{opt_head});
+      splice(@{$self->{h}}, $self->{opt_head});
+    }
+    if ($self->{opt_tail} > 0) {
+      splice(@{$self->{s}}, 0, -$self->{opt_tail});
+      splice(@{$self->{h}}, 0, -$self->{opt_tail});
+    }
+
+    @messages = ( @{$self->{s}}, @{$self->{h}} );
+    undef $self->{s};
+    undef $self->{h};
+  }
+  else {
+    # Sort the spam and ham groups by date
+    my @s = sort { $a cmp $b } @{$self->{s}};
+    undef $self->{s};
+    my @h = sort { $a cmp $b } @{$self->{h}};
+    undef $self->{h};
+
+    # head or tail > 0 means crop each list
+    if ($self->{opt_head} > 0) {
+      splice(@s, $self->{opt_head});
+      splice(@h, $self->{opt_head});
+    }
+    if ($self->{opt_tail} > 0) {
+      splice(@s, 0, -$self->{opt_tail});
+      splice(@h, 0, -$self->{opt_tail});
+    }
+
+    # interleave ordered spam and ham
+    while (@s && @h) {
+      push @messages, (shift @s), (shift @h);
+    }
+
+    # push the rest onto the end
+    push @messages, @s, @h;
+  }
+
+  # head or tail < 0 means crop the total list, negate the value appropriately
+  if ($self->{opt_head} < 0) {
+    splice(@messages, -$self->{opt_head});
+  }
+  if ($self->{opt_tail} < 0) {
+    splice(@messages, 0, $self->{opt_tail});
+  }
+
+  # Convert scan index format to run index format
+  # TODO: figure out a better scan index format which doesn't include newlines
+  # so readline() works ...
+  foreach (@messages) {
+    $_ = run_index_pack(scan_index_unpack($_));
+  }
+
+  # Dump out the messages to the temp file if we're using one
+  if (defined $fh) {
+    print { $fh } map { "$_\n" } scalar(@messages), @messages;
+    return;
+  }
+
+  return scalar(@messages), \@messages;
+}
+
 sub mail_open {
   my ($file) = @_;
 
@@ -618,14 +792,22 @@
 
 ############################################################################
 
-sub index_pack {
-  return join("\000", @_);
+# 0 850852128			atime
+# 1 h				class
+# 2 m				format
+# 3 ./ham/goodmsgs.0		path
+
+sub scan_index_pack {
+  # with opt_n, put the date first, and pack it.  faster for sorting...
+  return pack("NAAA*", @_);
 }
 
-sub index_unpack {
-  return split(/\000/, $_[0]);
+sub scan_index_unpack {
+  return unpack("NAAA*", $_[0]);
 }
 
+############################################################################
+
 sub scan_directory {
   my ($self, $class, $folder) = @_;
 
@@ -654,7 +836,7 @@
   my ($self, $class, $mail) = @_;
 
   if ($self->{opt_n}) {
-    $self->{$class}->{index_pack($class, "f", AI_TIME_UNKNOWN, $mail)} = AI_TIME_UNKNOWN;
+    push(@{$self->{$class}}, scan_index_pack(AI_TIME_UNKNOWN, $class, "f", $mail));
     return;
   }
   my $header;
@@ -666,7 +848,7 @@
   close(INPUT);
   my $date = Mail::SpamAssassin::Util::receive_date($header);
   return if !$self->message_is_useful_by_date($date);
-  $self->{$class}->{index_pack($class, "f", $date, $mail)} = $date;
+  push(@{$self->{$class}}, scan_index_pack($date, $class, "f", $mail));
 }
 
 sub scan_mailbox {
@@ -728,7 +910,7 @@
 	  next if !$self->message_is_useful_by_date($date);
 	}
 
-	$self->{$class}->{index_pack($class, "m", $date, "$file.$offset")} = $date;
+	push(@{$self->{$class}}, scan_index_pack($date, $class, "m", "$file.$offset"));
       }
     }
     close INPUT;
@@ -788,7 +970,7 @@
 		  next if !$self->message_is_useful_by_date($date);
 		}
 
-		$self->{$class}->{index_pack($class, "b", $date, "$file.$offset")} = $date;
+		push(@{$self->{$class}}, scan_index_pack($date, $class, "b", "$file.$offset"));
 
 		seek(INPUT, $offset + $size, 0);
 	    } else {
@@ -801,128 +983,11 @@
 
 ############################################################################
 
-sub run_message {
-  my ($self, $msg) = @_;
-
-  my ($class, $format, $date, $mail) = index_unpack($msg);
-
-  if ($format eq "f") {
-    return $self->run_file($class, $format, $mail, $date);
-  }
-  elsif ($format eq "m") {
-    return $self->run_mailbox($class, $format, $mail, $date);
-  }
-  elsif ($format eq "b") {
-    return $self->run_mbx($class, $format, $mail, $date);
-  }
-}
-
-sub run_file {
-  my ($self, $class, $format, $where, $date) = @_;
-
-  mail_open($where) or return;
-  # skip too-big mails
-  if (! $self->{opt_all} && -s INPUT > BIG_BYTES) {
-    close INPUT;
-    return;
-  }
-  my @msg;
-  my $header = '';
-  while(<INPUT>) {
-    if (!$header && /^$/) {
-      $header = join('', @msg);
-    }
-
-    push(@msg, $_);
-  }
-  close INPUT;
-
-  if ($date == AI_TIME_UNKNOWN) {
-    $date = Mail::SpamAssassin::Util::receive_date($header);
-  }
-
-  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
-}
-
-sub run_mailbox {
-  my ($self, $class, $format, $where, $date) = @_;
-
-  my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
-  my @msg;
-  my $header = '';
-  mail_open($file) or return;
-  seek(INPUT,$offset,0);
-  my $past = 0;
-  while (<INPUT>) {
-    if ($past) {
-      last if substr($_,0,5) eq "From ";
-    }
-    else {
-      $past = 1;
-    }
-
-    # skip too-big mails
-    if (! $self->{opt_all} && @msg > BIG_LINES) {
-      close INPUT;
-      return;
-    }
-
-    if (!$header && /^$/) {
-      $header = join('', @msg);
-    }
-
-    push (@msg, $_);
-  }
-  close INPUT;
-
-  if ($date == AI_TIME_UNKNOWN) {
-    $date = Mail::SpamAssassin::Util::receive_date($header);
-  }
-
-  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
-}
-
-sub run_mbx {
-  my ($self, $class, $format, $where, $date) = @_;
-
-  my ($file, $offset) = ($where =~ m/(.*)\.(\d+)$/);
-  my @msg;
-  my $header = '';
-
-  mail_open($file) or return;
-  seek(INPUT, $offset, 0);
-    
-  while (<INPUT>) {
-    last if ($_ =~ MBX_SEPARATOR);
-	
-    # skip mails that are too big
-    if (! $self->{opt_all} && @msg > BIG_LINES) {
-      close INPUT;
-      return;
-    }
-
-    if (!$header && /^$/) {
-      $header = join('', @msg);
-    }
-
-    push (@msg, $_);
-  }
-  close INPUT;
-
-  if ($date == AI_TIME_UNKNOWN) {
-    $date = Mail::SpamAssassin::Util::receive_date($header);
-  }
-
-  return($class, $format, $date, $where, &{$self->{wanted_sub}}($class, $where, $date, \@msg));
-}
-
-############################################################################
-
 sub fix_globs {
   my ($self, $path) = @_;
 
   # replace leading tilde with home dir: ~/abc => /home/jm/abc
-  $path =~ s/^~/$ENV{'HOME'}/;
+  $path =~ s!^~/!$ENV{'HOME'}!;
 
   # protect/escape spaces: ./Mail/My Letters => ./Mail/My\ Letters
   $path =~ s/([^\\])(\s)/$1\\$2/g;