You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/02/18 07:47:20 UTC

svn commit: rev 6716 - in incubator/spamassassin/trunk: lib/Mail lib/Mail/SpamAssassin t

Author: felicity
Date: Tue Feb 17 22:47:19 2004
New Revision: 6716

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
   incubator/spamassassin/trunk/t/mimeparse.t
Log:
make M::SA::parse() not do the full recursive mime tree generation.  let M::SA::MsgContainer::find_parts() do it as necessary.  this makes check() run at about the same speed, but things like 'spamassassin -d' should run a little faster due to the reduced amount of work that needs to occur.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	Tue Feb 17 22:47:19 2004
@@ -308,22 +308,28 @@
 of the message with 1 line per array element, or a file glob with the
 entire contents of the message.
 
-The procedure used to parse a message is recursive and ends up
-generating a tree of M::SA::MsgContainer objects.  parse() will generate
-the parent node of the tree, then pass the body of the message to
-M::SA::MsgParser->parse_body() which begins the recursive process.
+This function will return a base MsgContainer object with just the headers
+being parsed.  M::SA::MsgContainer->find_parts() will end up doing a
+full recursive mime parse of the message as necessary.  That procedure is
+recursive and ends up generating a tree of M::SA::MsgContainer objects.
+parse() will generate the parent node of the tree, then pass the body of
+the message to M::SA::MsgParser->parse_body() which begins the recursive
+process.
 
 =cut
 
+# NOTE: This function is allowed (in bad OO form) to modify the
+# MsgContainer object directly as MsgContainer doesn't really have a
+# constructor in the traditional OO way of things.
+
 sub parse {
   my($self, $message) = @_;
   $message ||= \*STDIN;
 
-  dbg("---- MIME PARSER START ----");
-
   # protect it from abuse ...
   local $_;
 
+  # Figure out how the message was passed to us, and deal with it.
   my @message;
   if (ref $message eq 'ARRAY') {
      @message = @{$message};
@@ -338,7 +344,7 @@
   }
 
   # Generate the main object and parse the appropriate MIME-related headers into it.
-  my $msg = Mail::SpamAssassin::MsgContainer->new();
+  my $msg = Mail::SpamAssassin::MsgContainer->new(already_parsed => 0);
   my $header = '';
   $msg->{'pristine_headers'} = '';
 
@@ -371,7 +377,8 @@
     last if ( $last =~ /^\r?$/m );
   }
 
-  # Store the pristine body for later -- store as a copy since @message will get modified below
+  # Store the pristine body for later -- store as a copy since @message
+  # will get modified below
   $msg->{'pristine_body'} = join('', @message);
 
   # CRLF -> LF
@@ -379,15 +386,11 @@
     s/\r\n/\n/;
   }
 
-  # Figure out the boundary
-  my ($boundary);
-  ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
-  dbg("main message type: ".$msg->{'type'});
-
-  # Make the tree
-  Mail::SpamAssassin::MsgParser->parse_body( $msg, $msg, $boundary, \@message, 1 );
-
-  dbg("---- MIME PARSER END ----");
+  # If the message does need to get parsed, save off a copy of the body
+  # in a format we can easily parse later so we don't have to rip from
+  # pristine_body ...
+  #
+  $msg->{'toparse'} = \@message;
 
   return $msg;
 }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	Tue Feb 17 22:47:19 2004
@@ -50,7 +50,7 @@
 sub new {
   my $class = shift;
   $class = ref($class) || $class;
-#  my %opts = @_;
+  my %opts = @_;
 
   my $self = {
     headers		=> {},
@@ -58,18 +58,52 @@
     metadata		=> {},
     body_parts		=> [],
     header_order	=> [],
+    already_parsed	=> 1,
     };
 
-#  # allow callers to set certain options ...
-#  foreach ( 'noexit' ) {
-#    $self->{$_} = $opts{$_} if ( exists $opts{$_} );
-#  }
+  # allow callers to set certain options ...
+  foreach ( 'already_parsed' ) {
+    $self->{$_} = $opts{$_} if ( exists $opts{$_} );
+  }
 
   bless($self,$class);
 
   $self;
 }
 
+=item _do_parse()
+
+Non-Public function which will initiate a MIME part part (generates
+a tree) of the current message.  Typically called by find_parts()
+as necessary.
+
+=cut
+
+sub _do_parse {
+  my($self) = @_;
+
+  # If we're called when we don't need to be, then just go ahead and return.
+  return if ($self->{'already_parsed'});
+
+  my $toparse = $self->{'toparse'};
+  delete $self->{'toparse'};
+
+  dbg("---- MIME PARSER START ----");
+
+  # Figure out the boundary
+  my ($boundary);
+  ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
+  dbg("main message type: ".$self->{'type'});
+
+  # Make the tree
+  Mail::SpamAssassin::MsgParser->parse_body( $self, $self, $boundary, $toparse, 1 );
+  $self->{'already_parsed'} = 1;
+
+  dbg("---- MIME PARSER END ----");
+
+  return;
+}
+
 =item find_parts()
 
 =cut
@@ -86,10 +120,23 @@
 
   $onlyleaves = 0 unless defined $onlyleaves;
   $recursive = 1 unless defined $recursive;
+
+  # ok, we need to do the parsing now...
+  $self->_do_parse() if (!$self->{'already_parsed'});
+  
+  return $self->_find_parts($re, $onlyleaves, $recursive);
+}
+
+# We have 2 functions in find_parts() to optimize out the penalty of
+# 'already_parsed'...  It also lets us avoid checking $onlyleaves, $re,
+# and $recursive over and over again.
+#
+sub _find_parts {
+  my ($self, $re, $onlyleaves, $recursive) = @_;
   my @ret = ();
 
   # If this object matches, mark it for return.
-  my $amialeaf = !exists $self->{'body_parts'};
+  my $amialeaf = $self->is_leaf();
 
   if ( $self->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) {
     push(@ret, $self);
@@ -99,7 +146,7 @@
     # This object is a subtree root.  Search all children.
     foreach my $parts ( @{$self->{'body_parts'}} ) {
       # Add the recursive results to our results
-      push(@ret, $parts->find_parts($re));
+      push(@ret, $parts->_find_parts($re));
     }
   }
 
@@ -183,10 +230,15 @@
 
 =item is_leaf()
 
+Returns true if the tree node in question is a leaf of the tree (ie:
+has no children of its own).  Note: This function may return odd results
+unless the message has been mime parsed via _do_parse()!
+
 =cut
 
 sub is_leaf {
-  return exists $_[0]->{'raw'};
+  my($self) = @_;
+  return !exists $self->{'body_parts'};
 }
 
 =item raw()
@@ -333,6 +385,10 @@
 }
 
 =item content_summary()
+
+Returns an array of scalars describing the mime parts of the message.
+Note: This function requires that the message be parsed first via
+_do_parse()!
 
 =cut
 

Modified: incubator/spamassassin/trunk/t/mimeparse.t
==============================================================================
--- incubator/spamassassin/trunk/t/mimeparse.t	(original)
+++ incubator/spamassassin/trunk/t/mimeparse.t	Tue Feb 17 22:47:19 2004
@@ -90,6 +90,10 @@
   open(INP, $k) || die "Can't find $k:$!";
   my $mail = Mail::SpamAssassin->parse(\*INP);
   close(INP);
+
+  # We know it's not parsed, so deal with it!
+  $mail->_do_parse();
+
   my $res = join("\n",$mail->content_summary());
   #print "---\n$res\n---\n";
   ok( $res eq shift @{$files{$k}} );