You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/14 08:44:26 UTC
svn commit: rev 6169 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME
Author: felicity
Date: Tue Jan 13 23:44:26 2004
New Revision: 6169
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
Log:
More parser work. Cleaned up the debug statements somewhat; got rid of
some more code that is unneeded now that we build a real tree as opposed
to a pseudo-tree; resolved a potential recursion issue with leaf nodes
pointing to themselves; and simplified some code to use -1 for the last
element in an array instead of "scalar @array - 1". find_parts() will
now look at the tree structure to find children instead of just looking
for children under multipart/* nodes (it's the same thing, but now it's
not tied to the content-type, just to the fact that there are child nodes).
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm Tue Jan 13 23:44:26 2004
@@ -97,8 +97,8 @@
if ( $self->{'type'} =~ /$re/ ) {
push(@ret, $self);
}
- elsif ( $self->{'type'} =~ m@^multipart/@i ) {
- # This object is a multipart container. Search all children.
+ elsif ( exists $self->{'body_parts'} ) {
+ # This object is a subtree root. Search all children.
foreach my $parts ( @{$self->{'body_parts'}} ) {
# Add the recursive results to our results
push(@ret, $parts->find_parts($re));
@@ -172,28 +172,6 @@
dbg("added part, type: ".$part->{'type'});
push @{ $self->{'body_parts'} }, $part;
-}
-
-sub body {
- my $self = shift;
- my $type = lc(shift);
- return unless @{ $self->{body_parts} };
- if ($type) {
-
- # warn("body has ", scalar(@{ $self->{body_parts} }), " [$type]\n");
- foreach my $body ( @{ $self->{body_parts} } ) {
-
- # warn("type: $body->[0]\n");
- if ( $type eq lc( $body->{type} ) ) {
- return $body;
- }
- }
- }
- else {
-
- # return first body part
- return $self->{body_parts}[0];
- }
}
sub dbg { Mail::SpamAssassin::dbg (@_); }
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm Tue Jan 13 23:44:26 2004
@@ -45,6 +45,8 @@
sub parse {
my($self,$message) = @_;
+ dbg("---- MIME PARSER START ----");
+
# protect it from abuse ...
local $_;
@@ -83,8 +85,13 @@
my ($boundary);
($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+ dbg("main message type: ".$msg->{'type'});
+
+ # Make the tree
$self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
+ dbg("---- MIME PARSER END ----");
+
return $msg;
}
@@ -131,12 +138,6 @@
# If it's not multipart, go ahead and just deal with it.
$self->_parse_normal( $msg, $_msg, $boundary, $body );
}
-
- if ( !$msg->body() ) {
- dbg("No message body found. Reparsing as blank.");
- my $part_msg = Mail::SpamAssassin::MIME->new();
- $self->_parse_normal( $msg, $part_msg, $boundary, [] );
- }
}
=item _parse_multipart()
@@ -167,23 +168,18 @@
# Else, there's no boundary, so leave the whole part...
}
- my $part_msg =
- Mail::SpamAssassin::MIME->new(); # just used for headers storage
+ my $part_msg = Mail::SpamAssassin::MIME->new(); # prepare a new tree node
my $in_body = 0;
-
my $header;
my $part_array;
my $line_count = @{$body};
foreach ( @{$body} ) {
+ # if we're on the last body line, or we find a boundary marker, deal with the mime part
if ( --$line_count == 0 || ($boundary && /^\-\-\Q$boundary\E/) ) {
+ my $line = $_; # remember the last line
- # end of part
- my $line = $_;
- chomp;
- dbg("Got end of MIME section: $_");
-
- # per rfc 1521, the CRLF before the boundary is part of the boundary ...
+ # per rfc 1521, the CRLF before the boundary is part of the boundary:
# NOTE: The CRLF preceding the encapsulation line is conceptually
# attached to the boundary so that it is possible to have a part
# that does not end with a CRLF (line break). Body parts that must
@@ -192,21 +188,24 @@
# of the preceding body part, and the second of which is part of the
# encapsulation boundary.
if ($part_array) {
- chomp( $part_array->[ scalar @{$part_array} - 1 ] );
- splice @{$part_array}, -1
- if ( $part_array->[ scalar @{$part_array} - 1 ] eq '' );
+ chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
+ splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
my($p_boundary);
($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
$p_boundary ||= $boundary;
+ dbg("found part of type ".$part_msg->{'type'}.", boundary: ".$p_boundary);
$self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
}
last if ($boundary && $line =~ /^\-\-\Q${boundary}\E\-\-$/);
+
+ # make sure we start with a new clean node
$in_body = 0;
$part_msg = Mail::SpamAssassin::MIME->new();
undef $part_array;
undef $header;
+
next;
}
@@ -247,12 +246,8 @@
sub _parse_normal {
my ($self, $msg, $part_msg, $boundary, $body) = @_;
- dbg("parsing normal".(defined $boundary ? ", got boundary: $boundary":""));
- delete $part_msg->{body_parts}; # single parts don't need a body_parts piece ...
-
- dbg("decoding attachment");
+ dbg("parsing normal, decoding attachment");
my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
- dbg("decoded $type");
$part_msg->{'type'} = $type;
$part_msg->{'decoded'} = $decoded;
@@ -266,6 +261,14 @@
}
$msg->add_body_part($part_msg);
+
+ # now that we've added the leaf node, let's go ahead and kill
+ # body_parts (used for sub-trees). it could end up being recursive,
+ # and well, let's avoid that. ;)
+ #
+ # BTW: please leave this after add_body_parts() since it'll add it back.
+ #
+ delete $part_msg->{body_parts};
}
sub __decode_header {
@@ -317,15 +320,17 @@
($filename) = ( $type =~ /name="?([^\";]+)"?/i );
}
- if ( lc( $msg->header('content-transfer-encoding') ) eq 'quoted-printable' ) {
- dbg("decoding QP file");
+ my $encoding = lc $msg->header('content-transfer-encoding') || '';
+
+ if ( $encoding eq 'quoted-printable' ) {
+ dbg("decoding: quoted-printable");
my @output =
map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
return $type, \@output, $filename;
}
- elsif ( lc( $msg->header('content-transfer-encoding') ) eq 'base64' ) {
- dbg("decoding B64 file");
+ elsif ( $encoding eq 'base64' ) {
+ dbg("decoding: base64");
# Generate the decoded output
my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
@@ -337,7 +342,12 @@
}
else {
# Encoding is one of 7bit, 8bit, binary or x-something
- dbg("decoding other encoding");
+ if ( $encoding ) {
+ dbg("decoding: other encoding type ($encoding), ignoring");
+ }
+ else {
+ dbg("decoding: no encoding detected");
+ }
# No encoding, so just point to the raw data ...
return $type, $body, $filename;