You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 08:44:26 UTC

svn commit: rev 6169 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME

Author: felicity
Date: Tue Jan 13 23:44:26 2004
New Revision: 6169

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
Log:
More parser work.  Cleaned up some of the debug statements.  Got rid of
some more code that is unneeded now that we build a real tree as opposed
to a pseudo-tree.  Resolved a potential recursion issue with leaf nodes
pointing to themselves.  Simplified some code to use -1 for the last
element in an array instead of "scalar @array - 1".  find_parts() will
now look at the tree structure to find children instead of just looking
for children under multipart/* nodes (the result is the same, but now it
is not tied to the content-type, just to the fact that there are child
nodes).



Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	Tue Jan 13 23:44:26 2004
@@ -97,8 +97,8 @@
   if ( $self->{'type'} =~ /$re/ ) {
     push(@ret, $self);
   }
-  elsif ( $self->{'type'} =~ m@^multipart/@i ) {
-    # This object is a multipart container.  Search all children.
+  elsif ( exists $self->{'body_parts'} ) {
+    # This object is a subtree root.  Search all children.
     foreach my $parts ( @{$self->{'body_parts'}} ) {
       # Add the recursive results to our results
       push(@ret, $parts->find_parts($re));
@@ -172,28 +172,6 @@
 
   dbg("added part, type: ".$part->{'type'});
   push @{ $self->{'body_parts'} }, $part;
-}
-
-sub body {
-  my $self = shift;
-  my $type = lc(shift);
-  return unless @{ $self->{body_parts} };
-  if ($type) {
-
-    # warn("body has ", scalar(@{ $self->{body_parts} }), " [$type]\n");
-    foreach my $body ( @{ $self->{body_parts} } ) {
-
-      # warn("type: $body->[0]\n");
-      if ( $type eq lc( $body->{type} ) ) {
-        return $body;
-      }
-    }
-  }
-  else {
-
-    # return first body part
-    return $self->{body_parts}[0];
-  }
 }
 
 sub dbg { Mail::SpamAssassin::dbg (@_); }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	Tue Jan 13 23:44:26 2004
@@ -45,6 +45,8 @@
 sub parse {
   my($self,$message) = @_;
 
+  dbg("---- MIME PARSER START ----");
+
   # protect it from abuse ...
   local $_;
 
@@ -83,8 +85,13 @@
 
   my ($boundary);
   ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+  dbg("main message type: ".$msg->{'type'});
+
+  # Make the tree
   $self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
 
+  dbg("---- MIME PARSER END ----");
+
   return $msg;
 }
 
@@ -131,12 +138,6 @@
     # If it's not multipart, go ahead and just deal with it.
     $self->_parse_normal( $msg, $_msg, $boundary, $body );
   }
-
-  if ( !$msg->body() ) {
-    dbg("No message body found. Reparsing as blank.");
-    my $part_msg = Mail::SpamAssassin::MIME->new();
-    $self->_parse_normal( $msg, $part_msg, $boundary, [] );
-  }
 }
 
 =item _parse_multipart()
@@ -167,23 +168,18 @@
     # Else, there's no boundary, so leave the whole part...
   }
 
-  my $part_msg =
-    Mail::SpamAssassin::MIME->new();    # just used for headers storage
+  my $part_msg = Mail::SpamAssassin::MIME->new();    # prepare a new tree node
   my $in_body = 0;
-
   my $header;
   my $part_array;
 
   my $line_count = @{$body};
   foreach ( @{$body} ) {
+    # if we're on the last body line, or we find a boundary marker, deal with the mime part
     if ( --$line_count == 0 || ($boundary && /^\-\-\Q$boundary\E/) ) {
+      my $line = $_; # remember the last line
 
-      # end of part
-      my $line = $_;
-      chomp;
-      dbg("Got end of MIME section: $_");
-
-      # per rfc 1521, the CRLF before the boundary is part of the boundary ...
+      # per rfc 1521, the CRLF before the boundary is part of the boundary:
       # NOTE: The CRLF preceding the encapsulation line is conceptually
       # attached to the boundary so that it is possible to have a part
       # that does not end with a CRLF (line break). Body parts that must
@@ -192,21 +188,24 @@
       # of the preceding body part, and the second of which is part of the
       # encapsulation boundary.
       if ($part_array) {
-        chomp( $part_array->[ scalar @{$part_array} - 1 ] );
-        splice @{$part_array}, -1
-          if ( $part_array->[ scalar @{$part_array} - 1 ] eq '' );
+        chomp( $part_array->[-1] );  # trim the CRLF that's part of the boundary
+        splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
 
         my($p_boundary);
 	($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
         $p_boundary ||= $boundary;
+	dbg("found part of type ".$part_msg->{'type'}.", boundary: ".$p_boundary);
         $self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
       }
 
       last if ($boundary && $line =~ /^\-\-\Q${boundary}\E\-\-$/);
+
+      # make sure we start with a new clean node
       $in_body  = 0;
       $part_msg = Mail::SpamAssassin::MIME->new();
       undef $part_array;
       undef $header;
+
       next;
     }
 
@@ -247,12 +246,8 @@
 sub _parse_normal {
   my ($self, $msg, $part_msg, $boundary, $body) = @_;
 
-  dbg("parsing normal".(defined $boundary ? ", got boundary: $boundary":""));
-  delete $part_msg->{body_parts}; # single parts don't need a body_parts piece ...
-
-  dbg("decoding attachment");
+  dbg("parsing normal, decoding attachment");
   my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
-  dbg("decoded $type");
 
   $part_msg->{'type'} = $type;
   $part_msg->{'decoded'} = $decoded;
@@ -266,6 +261,14 @@
   }
 
   $msg->add_body_part($part_msg);
+
+  # now that we've added the leaf node, let's go ahead and kill
+  # body_parts (used for sub-trees).  it could end up being recursive,
+  # and well, let's avoid that. ;)
+  #
+  # BTW: please leave this after add_body_parts() since it'll add it back.
+  #
+  delete $part_msg->{body_parts};
 }
 
 sub __decode_header {
@@ -317,15 +320,17 @@
     ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
   }
 
-  if ( lc( $msg->header('content-transfer-encoding') ) eq 'quoted-printable' ) {
-    dbg("decoding QP file");
+  my $encoding = lc $msg->header('content-transfer-encoding') || '';
+
+  if ( $encoding eq 'quoted-printable' ) {
+    dbg("decoding: quoted-printable");
     my @output =
       map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
 
     return $type, \@output, $filename;
   }
-  elsif ( lc( $msg->header('content-transfer-encoding') ) eq 'base64' ) {
-    dbg("decoding B64 file");
+  elsif ( $encoding eq 'base64' ) {
+    dbg("decoding: base64");
 
     # Generate the decoded output
     my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
@@ -337,7 +342,12 @@
   }
   else {
     # Encoding is one of 7bit, 8bit, binary or x-something
-    dbg("decoding other encoding");
+    if ( $encoding ) {
+      dbg("decoding: other encoding type ($encoding), ignoring");
+    }
+    else {
+      dbg("decoding: no encoding detected");
+    }
 
     # No encoding, so just point to the raw data ...
     return $type, $body, $filename;