You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 07:12:44 UTC
svn commit: rev 6165 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME
Author: felicity
Date: Tue Jan 13 22:12:43 2004
New Revision: 6165
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Log:
More parser work. The parser now handles all of my test messages
including rather annoying corner cases. I added some comment/pod
documentation so the process is a little less cloudy (I'm still not quite
sure where tree nodes are created ...) I got rid of a bunch of duplicated
code and made some of the parser more generic. The tree that's made is
now actually a full tree instead of a sorta-tree (root nodes had leaf
data, but no actual leaves ...) Renamed private functions to have an
underscore in front of their names. Implemented a find_parts() function
to search the tree and return leaves/sub-trees that match. Phew!
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm Tue Jan 13 22:12:43 2004
@@ -64,6 +64,8 @@
use MIME::Base64;
use Mail::SpamAssassin;
+# M::SA::MIME is an object method used to encapsulate a message's MIME part
+#
sub new {
my $class = shift;
$class = ref($class) || $class;
@@ -79,6 +81,35 @@
$self;
}
+# Used to find any MIME parts whose simple content-type matches a given regexp
+# Searches it's own and any children parts. Returns an array of MIME
+# objects which match.
+#
+sub find_parts {
+ my ($self, $re) = @_;
+
+ # Didn't pass an RE? Just abort.
+ return () unless $re;
+
+ my @ret = ();
+
+ # If this object matches, mark it for return.
+ if ( $self->{'type'} =~ /$re/ ) {
+ push(@ret, $self);
+ }
+ elsif ( $self->{'type'} =~ m@^multipart/@i ) {
+ # This object is a multipart container. Search all children.
+ foreach my $parts ( @{$self->{'body_parts'}} ) {
+ # Add the recursive results to our results
+ push(@ret, $parts->find_parts($re));
+ }
+ }
+
+ return @ret;
+}
+
+# Store or retrieve headers from a given MIME object
+#
sub header {
my $self = shift;
my $rawkey = shift;
@@ -91,30 +122,32 @@
if (@_) {
my ( $decoded_value, $raw_value ) = @_;
$raw_value = $decoded_value unless defined $raw_value;
- if ( exists $self->{headers}{$key} ) {
- push @{ $self->{headers}{$key} }, $decoded_value;
- push @{ $self->{raw_headers}{$key} }, $raw_value;
+ if ( exists $self->{'headers'}{$key} ) {
+ push @{ $self->{'headers'}{$key} }, $decoded_value;
+ push @{ $self->{'raw_headers'}{$key} }, $raw_value;
}
else {
- $self->{headers}{$key} = [$decoded_value];
- $self->{raw_headers}{$key} = [$raw_value];
+ $self->{'headers'}{$key} = [$decoded_value];
+ $self->{'raw_headers'}{$key} = [$raw_value];
}
- return $self->{headers}{$key}[-1];
+ return $self->{'headers'}{$key}[-1];
}
my $want = wantarray;
if ( defined($want) ) {
if ($want) {
- return unless exists $self->{headers}{$key};
- return @{ $self->{headers}{$key} };
+ return unless exists $self->{'headers'}{$key};
+ return @{ $self->{'headers'}{$key} };
}
else {
- return '' unless exists $self->{headers}{$key};
- return $self->{headers}{$key}[-1];
+ return '' unless exists $self->{'headers'}{$key};
+ return $self->{'headers'}{$key}[-1];
}
}
}
+# Retrieve raw headers from a given MIME object
+#
sub raw_header {
my $self = shift;
my $key = lc(shift);
@@ -124,36 +157,21 @@
$key =~ s/\s+$//;
if (wantarray) {
- return unless exists $self->{raw_headers}{$key};
- return @{ $self->{raw_headers}{$key} };
+ return unless exists $self->{'raw_headers'}{$key};
+ return @{ $self->{'raw_headers'}{$key} };
}
else {
- return '' unless exists $self->{raw_headers}{$key};
- return $self->{raw_headers}{$key}[-1];
+ return '' unless exists $self->{'raw_headers'}{$key};
+ return $self->{'raw_headers'}{$key}[-1];
}
}
+# Add a MIME child part to ourselves
sub add_body_part {
- my($self, $raw_type, $opts) = @_;
-
- my $type = $raw_type;
- $type ||= 'text/plain';
- $type =~ s/;.*$//; # strip everything after first semi-colon
- $type =~ s@^([^/]+/[^/]+).*$@$1@; # only something/something ...
- $type =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d; # strip inappropriate chars
-
- my $part = {
- type => $type,
- };
+ my($self, $part) = @_;
- while( my($k,$v) = each %{$opts} ) {
- $part->{$k} = $v;
- }
-
- dbg("added part, type: $type");
-
- # Add the part to body_parts
- push @{ $self->{body_parts} }, $part;
+ dbg("added part, type: ".$part->{'type'});
+ push @{ $self->{'body_parts'} }, $part;
}
sub body {
@@ -176,11 +194,6 @@
# return first body part
return $self->{body_parts}[0];
}
-}
-
-sub bodies {
- my $self = shift;
- return @{ $self->{body_parts} };
}
sub dbg { Mail::SpamAssassin::dbg (@_); }
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm Tue Jan 13 22:12:43 2004
@@ -33,23 +33,29 @@
To use it, simply call C<Mail::SpamAssassin::MIME::Parser->parse($msg)>,
where $msg is a scalar with the entire contents of the mesage.
-More information should go here. ;)
+The procedure used to parse a message is recursive and ends up generating
+a tree of M::SA::MIME objects. parse() will generate the parent node
+of the tree, then pass the body of the message to _parse_body() which begins
+the recursive process.
+
+This is the only public method available!
=cut
-# constructor
sub parse {
my($self,$message) = @_;
- # now go generate stuff
+ # protect it from abuse ...
+ local $_;
+
+ # Split the scalar into an array of lines
my @message = split ( /^/m, $message );
- # trim mbox seperator
+
+ # trim mbox seperator if it exists
shift @message if ( scalar @message > 0 && $message[0] =~ /^From\s/ );
+ # Generate the main object and parse the appropriate MIME-related headers into it.
my $msg = Mail::SpamAssassin::MIME->new();
-
- local $_; # protect from abuse
-
my $header = '';
while ( my $last = shift @message ) {
@@ -75,28 +81,28 @@
last if ( $last =~ /^$/m );
}
- # Parse out the body ...
- # the actual ABNF, BTW:
- # boundary := 0*69<bchars> bcharsnospace
- # bchars := bcharsnospace / " "
- # bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
- # / "," / "-" / "." / "/" / ":" / "=" / "?"
- #
- my ($boundary) =
- $msg->header('content-type') =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+ my ($boundary);
+ ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
$self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
- unless ( $msg->{'type'} ) {
- $msg->{'type'} = $msg->header('content-type');
- $msg->{'type'} ||= 'text/plain';
- $msg->{'type'} =~ s/;.*$//; # strip everything after first semi-colon
- $msg->{'type'} =~ s@^([^/]+/[^/]+).*$@$1@; # only something/something ...
- $msg->{'type'} =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d; # strip inappropriate chars
- }
-
return $msg;
}
+=item _parse_body()
+
+_parse_body() passes the body part that was passed in onto the
+correct part parser, either _parse_multipart() for multipart/* parts,
+or _parse_normal() for everything else. Multipart sections become the
+root of sub-trees, while everything else becomes a leaf in the tree.
+
+For multipart messages, the first call to _parse_body() doesn't create a
+new sub-tree and just uses the parent node to contain children. All other
+calls to _parse_body() will cause a new sub-tree root to be created and
+children will exist underneath that root. (this is just so the tree
+doesn't have a root node which points at the actual root node ...)
+
+=cut
+
sub _parse_body {
my($self, $msg, $_msg, $boundary, $body, $initial) = @_;
@@ -105,45 +111,45 @@
s/\r\n/\n/;
}
+ # Figure out the simple content-type, or set it to text/plain
my $type = $_msg->header('Content-Type') || 'text/plain; charset=us-ascii';
- # warn "Parsing message of type: $type\n";
-
- if ( $type =~ /^text\/plain/i ) {
- dbg("Parse text/plain");
- $self->_parse_normal( $msg, $_msg, $boundary, $body );
- }
- elsif ( $type =~ /^text\/html/i ) {
- dbg("Parse text/html");
- $self->_parse_normal( $msg, $_msg, $boundary, $body );
- }
- elsif ( $type =~ /^multipart\//i ) {
- dbg("Parse $type");
+ if ( $type =~ /^multipart\//i ) {
+ # Treat an initial multipart parse differently. This will keep the tree:
+ # obj(multipart->[ part1, part2 ]) instead of
+ # obj(obj(multipart ...))
+ #
if ( $initial ) {
$self->_parse_multipart( $msg, $_msg, $boundary, $body );
}
else {
$self->_parse_multipart( $_msg, $_msg, $boundary, $body );
- $msg->add_body_part( $type, $_msg );
+ $msg->add_body_part( $_msg );
}
}
else {
- dbg("Regular attachment");
- $self->_decode_body( $msg, $_msg, $boundary, $body );
+ # If it's not multipart, go ahead and just deal with it.
+ $self->_parse_normal( $msg, $_msg, $boundary, $body );
}
if ( !$msg->body() ) {
dbg("No message body found. Reparsing as blank.");
my $part_msg = Mail::SpamAssassin::MIME->new();
- $self->_decode_body( $msg, $part_msg, $boundary, [] );
+ $self->_parse_normal( $msg, $part_msg, $boundary, [] );
}
}
+=item _parse_multipart()
+
+Generate a root node, and for each child part call _parse_body().
+
+=cut
+
sub _parse_multipart {
my($self, $msg, $_msg, $boundary, $body) = @_;
$boundary ||= '';
- dbg("m/m got boundary: $boundary");
+ dbg("parsing multipart, got boundary: $boundary");
# ignore preamble per RFC 1521, unless there's no boundary ...
if ( $boundary ) {
@@ -173,8 +179,9 @@
if ( --$line_count == 0 || ($boundary && /^\-\-\Q$boundary\E/) ) {
# end of part
- dbg("Got end of MIME section: $_");
my $line = $_;
+ chomp;
+ dbg("Got end of MIME section: $_");
# per rfc 1521, the CRLF before the boundary is part of the boundary ...
# NOTE: The CRLF preceding the encapsulation line is conceptually
@@ -189,9 +196,8 @@
splice @{$part_array}, -1
if ( $part_array->[ scalar @{$part_array} - 1 ] eq '' );
- my ($p_boundary) =
- $part_msg->header('content-type') =~
- /boundary\s*=\s*["']?([^"';]+)["']?/i;
+ my($p_boundary);
+ ($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
$p_boundary ||= $boundary;
$self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
}
@@ -232,11 +238,34 @@
}
+=item _parse_normal()
+
+Generate a leaf node and add it to the parent.
+
+=cut
+
sub _parse_normal {
- my($self, $msg, $_msg, $boundary, $body) = @_;
+ my ($self, $msg, $part_msg, $boundary, $body) = @_;
+
+ dbg("parsing normal".(defined $boundary ? ", got boundary: $boundary":""));
+ delete $part_msg->{body_parts}; # single parts don't need a body_parts piece ...
+
+ dbg("decoding attachment");
+ my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
+ dbg("decoded $type");
+
+ $part_msg->{'type'} = $type;
+ $part_msg->{'decoded'} = $decoded;
+ $part_msg->{'raw'} = $body;
+ $part_msg->{'boundary'} = $boundary;
+ $part_msg->{'name'} = $name if $name;
+
+ # If the message is a text/* type, then try rendering it...
+ if ( $type =~ /^text\b/i ) {
+ ($part_msg->{'rendered'}, $part_msg->{'rendered_type'}) = _render_text($type, $decoded);
+ }
- # extract body, store it in $msg
- $self->_decode_body( $msg, $_msg, $boundary, $body );
+ $msg->add_body_part($part_msg);
}
sub __decode_header {
@@ -255,7 +284,12 @@
}
}
-# decode according to RFC2047
+=item _decode_header()
+
+Decode base64 and quoted-printable in headers according to RFC2047.
+
+=cut
+
sub _decode_header {
my($self, $header) = @_;
@@ -267,43 +301,27 @@
return $header;
}
-sub _decode_body {
- my ($self, $msg, $part_msg, $boundary, $body) = @_;
-
- dbg("decoding attachment");
-
- my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
-
- dbg("decoded attachment type: $type");
+=item _decode()
- my $opts = {
- decoded => $decoded,
- raw => $body,
- boundary => $boundary,
- headers => $part_msg->{headers},
- raw_headers => $part_msg->{raw_headers},
- };
- $opts->{name} = $name if $name;
- $opts->{rendered} = _render_text($type, $decoded) if $type =~ /^text/i;
+Decode base64 and quoted-printable parts.
- $msg->add_body_part( $type, $opts );
-}
+=cut
sub _decode {
my($self, $msg, $body ) = @_;
+ my($type) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+ my ($filename) =
+ ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
+ if ( !$filename ) {
+ ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
+ }
+
if ( lc( $msg->header('content-transfer-encoding') ) eq 'quoted-printable' ) {
dbg("decoding QP file");
my @output =
map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
- my $type = $msg->header('content-type');
- my ($filename) =
- ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
- if ( !$filename ) {
- ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
- }
-
return $type, \@output, $filename;
}
elsif ( lc( $msg->header('content-transfer-encoding') ) eq 'base64' ) {
@@ -312,14 +330,6 @@
# Generate the decoded output
my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
- # If it has a filename, figure it out.
- my $type = $msg->header('content-type');
- my ($filename) =
- ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
- if ( !$filename ) {
- ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
- }
-
# If it's a type text or message, split it into an array of lines
$output = [ map { s/\r\n/\n/; $_; } split(/^/m, $output->[0]) ] if ( $type =~ m@^(?:text|message)/@ );
@@ -329,21 +339,20 @@
# Encoding is one of 7bit, 8bit, binary or x-something
dbg("decoding other encoding");
- my $type = $msg->header('content-type');
- my ($filename) =
- ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
- if ( !$filename ) {
- ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
- }
-
# No encoding, so just point to the raw data ...
return $type, $body, $filename;
}
}
-# render text/plain as text/html based on a heuristic which simulates
-# a certain common mail client
-sub html_near_start {
+=item _html_near_start()
+
+Look at a text scalar and determine whether it should be rendered
+as text/html. Based on a heuristic which simulates a certain common
+mail client.
+
+=cut
+
+sub _html_near_start {
my ($pad) = @_;
my $count = 0;
@@ -352,6 +361,19 @@
return ($count < 24);
}
+=item _render_text()
+
+_render_text() takes the given text/* type MIME part, and attempt
+to render it into a text scalar. It will always render text/html,
+and will use a heuristic to determine if other text/* parts should be
+considered text/html.
+
+Pass in the content-type and the decoded part array. Returns a scalar
+with the rendered data and the "rendered_as" content-type (same as passed
+in for no rendering, "text/html" for HTML).
+
+=cut
+
sub _render_text {
my ($type, $decoded) = @_;
@@ -361,16 +383,16 @@
# on a heuristic which simulates a certain common mail client
if ( $type =~ m@^text/html\b@i ||
($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
- html_near_start($1)
+ _html_near_start($1)
)
) {
my $html = Mail::SpamAssassin::HTML->new(); # object
my $html_rendered = $html->html_render($text); # rendered text
my $html_results = $html->get_results(); # needed in eval tests
- return join('', @{ $html_rendered });
+ return ( join('', @{ $html_rendered }), 'text/html' );
}
else {
- return $text;
+ return ( $text, $type );
}
}
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Tue Jan 13 22:12:43 2004
@@ -598,6 +598,33 @@
###########################################################################
+sub parse_content_type {
+ # This routine is typically called by passing a
+ # get_header("content-type") which passes all content-type headers
+ # (array context). If there are multiple Content-type headers (invalid,
+ # but it happens), MUAs seem to take the last one and so that's what we
+ # should do here.
+ #
+ my $ct = $_[-1] || 'text/plain; charset=us-ascii';
+
+ # This could be made a bit more rigid ...
+ # the actual ABNF, BTW:
+ # boundary := 0*69<bchars> bcharsnospace
+ # bchars := bcharsnospace / " "
+ # bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
+ # / "," / "-" / "." / "/" / ":" / "=" / "?"
+ #
+ my ($boundary) = $ct =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+
+ # Get the type out ...
+ $ct =~ s/;.*$//; # strip everything after first semi-colon
+ $ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@; # only something/something ...
+ $ct =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d; # strip inappropriate chars
+
+ return ($ct,$boundary);
+}
+
+###########################################################################
sub dbg { Mail::SpamAssassin::dbg (@_); }
1;