You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 07:12:44 UTC

svn commit: rev 6165 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME

Author: felicity
Date: Tue Jan 13 22:12:43 2004
New Revision: 6165

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Log:
More parser work.  The parser now handles all of my test messages
including rather annoying corner cases.  I added some comment/pod
documentation so the process is a little less cloudy (I'm still not quite
sure where tree nodes are created ...)  I got rid of a bunch of duplicated
code and made some of the parser more generic.  The tree that's made is
now actually a full tree instead of a sorta-tree (root nodes had leaf
data, but no actual leaves ...)  Renamed private functions to have an
underscore in front of their names.  Implemented a find_parts() function
to search the tree and return leaves/sub-trees that match.  Phew!


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	Tue Jan 13 22:12:43 2004
@@ -64,6 +64,8 @@
 use MIME::Base64;
 use Mail::SpamAssassin;
 
+# M::SA::MIME is an object method used to encapsulate a message's MIME part
+#
 sub new {
   my $class = shift;
   $class = ref($class) || $class;
@@ -79,6 +81,35 @@
   $self;
 }
 
+# Used to find any MIME parts whose simple content-type matches a given regexp
+# Searches it's own and any children parts.  Returns an array of MIME
+# objects which match.
+#
+sub find_parts {
+  my ($self, $re) = @_;
+
+  # Didn't pass an RE?  Just abort.
+  return () unless $re;
+
+  my @ret = ();
+
+  # If this object matches, mark it for return.
+  if ( $self->{'type'} =~ /$re/ ) {
+    push(@ret, $self);
+  }
+  elsif ( $self->{'type'} =~ m@^multipart/@i ) {
+    # This object is a multipart container.  Search all children.
+    foreach my $parts ( @{$self->{'body_parts'}} ) {
+      # Add the recursive results to our results
+      push(@ret, $parts->find_parts($re));
+    }
+  }
+
+  return @ret;
+}
+
+# Store or retrieve headers from a given MIME object
+#
 sub header {
   my $self   = shift;
   my $rawkey = shift;
@@ -91,30 +122,32 @@
   if (@_) {
     my ( $decoded_value, $raw_value ) = @_;
     $raw_value = $decoded_value unless defined $raw_value;
-    if ( exists $self->{headers}{$key} ) {
-      push @{ $self->{headers}{$key} },     $decoded_value;
-      push @{ $self->{raw_headers}{$key} }, $raw_value;
+    if ( exists $self->{'headers'}{$key} ) {
+      push @{ $self->{'headers'}{$key} },     $decoded_value;
+      push @{ $self->{'raw_headers'}{$key} }, $raw_value;
     }
     else {
-      $self->{headers}{$key}     = [$decoded_value];
-      $self->{raw_headers}{$key} = [$raw_value];
+      $self->{'headers'}{$key}     = [$decoded_value];
+      $self->{'raw_headers'}{$key} = [$raw_value];
     }
-    return $self->{headers}{$key}[-1];
+    return $self->{'headers'}{$key}[-1];
   }
 
   my $want = wantarray;
   if ( defined($want) ) {
     if ($want) {
-      return unless exists $self->{headers}{$key};
-      return @{ $self->{headers}{$key} };
+      return unless exists $self->{'headers'}{$key};
+      return @{ $self->{'headers'}{$key} };
     }
     else {
-      return '' unless exists $self->{headers}{$key};
-      return $self->{headers}{$key}[-1];
+      return '' unless exists $self->{'headers'}{$key};
+      return $self->{'headers'}{$key}[-1];
     }
   }
 }
 
+# Retrieve raw headers from a given MIME object
+#
 sub raw_header {
   my $self = shift;
   my $key  = lc(shift);
@@ -124,36 +157,21 @@
   $key       =~ s/\s+$//;
 
   if (wantarray) {
-    return unless exists $self->{raw_headers}{$key};
-    return @{ $self->{raw_headers}{$key} };
+    return unless exists $self->{'raw_headers'}{$key};
+    return @{ $self->{'raw_headers'}{$key} };
   }
   else {
-    return '' unless exists $self->{raw_headers}{$key};
-    return $self->{raw_headers}{$key}[-1];
+    return '' unless exists $self->{'raw_headers'}{$key};
+    return $self->{'raw_headers'}{$key}[-1];
   }
 }
 
+# Add a MIME child part to ourselves
 sub add_body_part {
-  my($self, $raw_type, $opts) = @_;
-
-  my $type = $raw_type;
-  $type     ||= 'text/plain';
-  $type =~ s/;.*$//;            	# strip everything after first semi-colon
-  $type =~ s@^([^/]+/[^/]+).*$@$1@;	# only something/something ...
-  $type =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d;    # strip inappropriate chars
-
-  my $part = {
-    type     => $type,
-  };
+  my($self, $part) = @_;
 
-  while( my($k,$v) = each %{$opts} ) {
-    $part->{$k} = $v;
-  }
-
-  dbg("added part, type: $type");
-
-  # Add the part to body_parts
-  push @{ $self->{body_parts} }, $part;
+  dbg("added part, type: ".$part->{'type'});
+  push @{ $self->{'body_parts'} }, $part;
 }
 
 sub body {
@@ -176,11 +194,6 @@
     # return first body part
     return $self->{body_parts}[0];
   }
-}
-
-sub bodies {
-  my $self = shift;
-  return @{ $self->{body_parts} };
 }
 
 sub dbg { Mail::SpamAssassin::dbg (@_); }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	Tue Jan 13 22:12:43 2004
@@ -33,23 +33,29 @@
 To use it, simply call C<Mail::SpamAssassin::MIME::Parser->parse($msg)>,
 where $msg is a scalar with the entire contents of the mesage.
 
-More information should go here. ;)
+The procedure used to parse a message is recursive and ends up generating
+a tree of M::SA::MIME objects.  parse() will generate the parent node
+of the tree, then pass the body of the message to _parse_body() which begins
+the recursive process.
+
+This is the only public method available!
 
 =cut
 
-# constructor
 sub parse {
   my($self,$message) = @_;
 
-  # now go generate stuff
+  # protect it from abuse ...
+  local $_;
+
+  # Split the scalar into an array of lines
   my @message = split ( /^/m, $message );
-  # trim mbox seperator
+
+  # trim mbox seperator if it exists
   shift @message if ( scalar @message > 0 && $message[0] =~ /^From\s/ );
 
+  # Generate the main object and parse the appropriate MIME-related headers into it.
   my $msg = Mail::SpamAssassin::MIME->new();
-
-  local $_;                                          # protect from abuse
-
   my $header = '';
 
   while ( my $last = shift @message ) {
@@ -75,28 +81,28 @@
     last if ( $last =~ /^$/m );
   }
 
-  # Parse out the body ...
-  # the actual ABNF, BTW:
-  # boundary := 0*69<bchars> bcharsnospace
-  # bchars := bcharsnospace / " "
-  # bcharsnospace :=    DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
-  #               / "," / "-" / "." / "/" / ":" / "=" / "?"
-  #
-  my ($boundary) =
-    $msg->header('content-type') =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+  my ($boundary);
+  ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
   $self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
 
-  unless ( $msg->{'type'} ) {
-    $msg->{'type'} = $msg->header('content-type');
-    $msg->{'type'} ||= 'text/plain';
-    $msg->{'type'} =~ s/;.*$//;                    # strip everything after first semi-colon
-    $msg->{'type'} =~ s@^([^/]+/[^/]+).*$@$1@;     # only something/something ...
-    $msg->{'type'} =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d;    # strip inappropriate chars
-  }
-
   return $msg;
 }
 
+=item _parse_body()
+
+_parse_body() passes the body part that was passed in onto the
+correct part parser, either _parse_multipart() for multipart/* parts,
+or _parse_normal() for everything else.  Multipart sections become the
+root of sub-trees, while everything else becomes a leaf in the tree.
+
+For multipart messages, the first call to _parse_body() doesn't create a
+new sub-tree and just uses the parent node to contain children.  All other
+calls to _parse_body() will cause a new sub-tree root to be created and
+children will exist underneath that root.  (this is just so the tree
+doesn't have a root node which points at the actual root node ...)
+
+=cut
+
 sub _parse_body {
   my($self, $msg, $_msg, $boundary, $body, $initial) = @_;
 
@@ -105,45 +111,45 @@
     s/\r\n/\n/;
   }
 
+  # Figure out the simple content-type, or set it to text/plain
   my $type = $_msg->header('Content-Type') || 'text/plain; charset=us-ascii';
 
-  #    warn "Parsing message of type: $type\n";
-
-  if ( $type =~ /^text\/plain/i ) {
-    dbg("Parse text/plain");
-    $self->_parse_normal( $msg, $_msg, $boundary, $body );
-  }
-  elsif ( $type =~ /^text\/html/i ) {
-    dbg("Parse text/html");
-    $self->_parse_normal( $msg, $_msg, $boundary, $body );
-  }
-  elsif ( $type =~ /^multipart\//i ) {
-    dbg("Parse $type");
+  if ( $type =~ /^multipart\//i ) {
+    # Treat an initial multipart parse differently.  This will keep the tree:
+    # obj(multipart->[ part1, part2 ]) instead of
+    # obj(obj(multipart ...))
+    #
     if ( $initial ) {
       $self->_parse_multipart( $msg, $_msg, $boundary, $body );
     }
     else {
       $self->_parse_multipart( $_msg, $_msg, $boundary, $body );
-      $msg->add_body_part( $type, $_msg );
+      $msg->add_body_part( $_msg );
     }
   }
   else {
-    dbg("Regular attachment");
-    $self->_decode_body( $msg, $_msg, $boundary, $body );
+    # If it's not multipart, go ahead and just deal with it.
+    $self->_parse_normal( $msg, $_msg, $boundary, $body );
   }
 
   if ( !$msg->body() ) {
     dbg("No message body found. Reparsing as blank.");
     my $part_msg = Mail::SpamAssassin::MIME->new();
-    $self->_decode_body( $msg, $part_msg, $boundary, [] );
+    $self->_parse_normal( $msg, $part_msg, $boundary, [] );
   }
 }
 
+=item _parse_multipart()
+
+Generate a root node, and for each child part call _parse_body().
+
+=cut
+
 sub _parse_multipart {
   my($self, $msg, $_msg, $boundary, $body) = @_;
 
   $boundary ||= '';
-  dbg("m/m got boundary: $boundary");
+  dbg("parsing multipart, got boundary: $boundary");
 
   # ignore preamble per RFC 1521, unless there's no boundary ...
   if ( $boundary ) {
@@ -173,8 +179,9 @@
     if ( --$line_count == 0 || ($boundary && /^\-\-\Q$boundary\E/) ) {
 
       # end of part
-      dbg("Got end of MIME section: $_");
       my $line = $_;
+      chomp;
+      dbg("Got end of MIME section: $_");
 
       # per rfc 1521, the CRLF before the boundary is part of the boundary ...
       # NOTE: The CRLF preceding the encapsulation line is conceptually
@@ -189,9 +196,8 @@
         splice @{$part_array}, -1
           if ( $part_array->[ scalar @{$part_array} - 1 ] eq '' );
 
-        my ($p_boundary) =
-          $part_msg->header('content-type') =~
-          /boundary\s*=\s*["']?([^"';]+)["']?/i;
+        my($p_boundary);
+	($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
         $p_boundary ||= $boundary;
         $self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
       }
@@ -232,11 +238,34 @@
 
 }
 
+=item _parse_normal()
+
+Generate a leaf node and add it to the parent.
+
+=cut
+
 sub _parse_normal {
-  my($self, $msg, $_msg, $boundary, $body) = @_;
+  my ($self, $msg, $part_msg, $boundary, $body) = @_;
+
+  dbg("parsing normal".(defined $boundary ? ", got boundary: $boundary":""));
+  delete $part_msg->{body_parts}; # single parts don't need a body_parts piece ...
+
+  dbg("decoding attachment");
+  my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
+  dbg("decoded $type");
+
+  $part_msg->{'type'} = $type;
+  $part_msg->{'decoded'} = $decoded;
+  $part_msg->{'raw'} = $body;
+  $part_msg->{'boundary'} = $boundary;
+  $part_msg->{'name'} = $name if $name;
+
+  # If the message is a text/* type, then try rendering it...
+  if ( $type =~ /^text\b/i ) {
+    ($part_msg->{'rendered'}, $part_msg->{'rendered_type'}) = _render_text($type, $decoded);
+  }
 
-  # extract body, store it in $msg
-  $self->_decode_body( $msg, $_msg, $boundary, $body );
+  $msg->add_body_part($part_msg);
 }
 
 sub __decode_header {
@@ -255,7 +284,12 @@
   }
 }
 
-# decode according to RFC2047
+=item _decode_header()
+
+Decode base64 and quoted-printable in headers according to RFC2047.
+
+=cut
+
 sub _decode_header {
   my($self, $header) = @_;
 
@@ -267,43 +301,27 @@
   return $header;
 }
 
-sub _decode_body {
-  my ($self, $msg, $part_msg, $boundary, $body) = @_;
-
-  dbg("decoding attachment");
-
-  my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
-
-  dbg("decoded attachment type: $type");
+=item _decode()
 
-  my $opts = {
-  	decoded => $decoded,
-	raw => $body,
-	boundary => $boundary,
-	headers => $part_msg->{headers},
-	raw_headers => $part_msg->{raw_headers},
-  };
-  $opts->{name} = $name if $name;
-  $opts->{rendered} = _render_text($type, $decoded) if $type =~ /^text/i;
+Decode base64 and quoted-printable parts.
 
-  $msg->add_body_part( $type, $opts );
-}
+=cut
 
 sub _decode {
   my($self, $msg, $body ) = @_;
 
+  my($type) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+  my ($filename) =
+    ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
+  if ( !$filename ) {
+    ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
+  }
+
   if ( lc( $msg->header('content-transfer-encoding') ) eq 'quoted-printable' ) {
     dbg("decoding QP file");
     my @output =
       map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
 
-    my $type = $msg->header('content-type');
-    my ($filename) =
-      ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
-    if ( !$filename ) {
-      ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
-    }
-
     return $type, \@output, $filename;
   }
   elsif ( lc( $msg->header('content-transfer-encoding') ) eq 'base64' ) {
@@ -312,14 +330,6 @@
     # Generate the decoded output
     my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
 
-    # If it has a filename, figure it out.
-    my $type = $msg->header('content-type');
-    my ($filename) =
-      ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
-    if ( !$filename ) {
-      ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
-    }
-
     # If it's a type text or message, split it into an array of lines
     $output = [ map { s/\r\n/\n/; $_; } split(/^/m, $output->[0]) ] if ( $type =~ m@^(?:text|message)/@ );
 
@@ -329,21 +339,20 @@
     # Encoding is one of 7bit, 8bit, binary or x-something
     dbg("decoding other encoding");
 
-    my $type = $msg->header('content-type');
-    my ($filename) =
-      ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
-    if ( !$filename ) {
-      ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
-    }
-
     # No encoding, so just point to the raw data ...
     return $type, $body, $filename;
   }
 }
 
-# render text/plain as text/html based on a heuristic which simulates
-# a certain common mail client
-sub html_near_start {
+=item _html_near_start()
+
+Look at a text scalar and determine whether it should be rendered
+as text/html.  Based on a heuristic which simulates a certain common
+mail client.
+
+=cut
+
+sub _html_near_start {
   my ($pad) = @_;
 
   my $count = 0;
@@ -352,6 +361,19 @@
   return ($count < 24);
 }
 
+=item _render_text()
+
+_render_text() takes the given text/* type MIME part, and attempt
+to render it into a text scalar.  It will always render text/html,
+and will use a heuristic to determine if other text/* parts should be
+considered text/html.
+
+Pass in the content-type and the decoded part array.  Returns a scalar
+with the rendered data and the "rendered_as" content-type (same as passed
+in for no rendering, "text/html" for HTML).
+
+=cut
+
 sub _render_text {
   my ($type, $decoded) = @_;
 
@@ -361,16 +383,16 @@
   # on a heuristic which simulates a certain common mail client
   if ( $type =~ m@^text/html\b@i ||
       ($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
-	html_near_start($1)
+	_html_near_start($1)
       )
      ) {
     my $html = Mail::SpamAssassin::HTML->new();		# object
     my $html_rendered = $html->html_render($text);	# rendered text
     my $html_results = $html->get_results();		# needed in eval tests
-    return join('', @{ $html_rendered });
+    return ( join('', @{ $html_rendered }), 'text/html' );
   }
   else {
-    return $text;
+    return ( $text, $type );
   }
 }
 

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	Tue Jan 13 22:12:43 2004
@@ -598,6 +598,33 @@
 
 ###########################################################################
 
+sub parse_content_type {
+  # This routine is typically called by passing a
+  # get_header("content-type") which passes all content-type headers
+  # (array context).  If there are multiple Content-type headers (invalid,
+  # but it happens), MUAs seem to take the last one and so that's what we
+  # should do here.
+  #
+  my $ct = $_[-1] || 'text/plain; charset=us-ascii';
+
+  # This could be made a bit more rigid ...
+  # the actual ABNF, BTW:
+  # boundary := 0*69<bchars> bcharsnospace
+  # bchars := bcharsnospace / " "
+  # bcharsnospace :=    DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
+  #               / "," / "-" / "." / "/" / ":" / "=" / "?"
+  #
+  my ($boundary) = $ct =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+
+  # Get the type out ...
+  $ct =~ s/;.*$//;                    # strip everything after first semi-colon
+  $ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@;	# only something/something ...
+  $ct =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d;    # strip inappropriate chars
+
+  return ($ct,$boundary);
+}
+
+###########################################################################
 sub dbg { Mail::SpamAssassin::dbg (@_); }
 
 1;