You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/18 06:06:22 UTC
svn commit: rev 6197 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME
Author: felicity
Date: Sat Jan 17 21:06:21 2004
New Revision: 6197
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Log:
More parser work ...
- be a little smarter when parsing the boundary. pay a little more attention to the rfc.
- do decode/render on-first-request. this'll save time during parsing
and memory until the need arises for a decoded/rendered part.
- generalized the text v. html difference rule to not know about internal data formats
- the parser will now deal with all of the message headers, not just mime-related ones
- decoding and rendering are now done in the object, not in the parser
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm Sat Jan 17 21:06:21 2004
@@ -3557,14 +3557,15 @@
my($self) = @_;
my @ma = $self->{msg}->{mime_parts}->find_parts(qr@^multipart/alternative\b@i);
+ my @content = $self->{msg}->{mime_parts}->content_summary();
$self->{madiff} = 0;
# Exchange meeting requests come in as m/a text/html text/calendar ...
# Ignore any messages without a multipart/alternative section as well ...
- if ( !@ma || (@ma == 1 && @{$ma[0]->{body_parts}} == 2 &&
- $ma[0]->{body_parts}->[0]->{type} =~ m@^text/html\b@i &&
- $ma[0]->{body_parts}->[1]->{type} =~ m@^text/calendar\b@i) ) {
+ if ( !@ma || (@content == 3 && $content[2] eq 'text/calendar' &&
+ $content[1] eq 'text/html' &&
+ $content[0] eq 'multipart/alternative') ) {
return;
}
@@ -3575,9 +3576,9 @@
my @txt = $part->find_parts(qr@^text\b@i);
foreach my $text ( @txt ) {
- my $rnd = $text->{'rendered'};
+ my($type, $rnd) = $text->rendered();
- if ( $text->{'rendered_type'} =~ m@^text/html\b@i ) {
+ if ( $type eq 'text/html' ) {
foreach my $w ( grep(/\w/,split(/\s+/,$rnd)) ) {
next if ( $w =~ /^URI:/ );
$html{$w}++;
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm Sat Jan 17 21:06:21 2004
@@ -174,6 +174,147 @@
push @{ $self->{'body_parts'} }, $part;
}
+=item _decode()
+
+Decode base64 and quoted-printable parts.
+
+=cut
+
+# TODO: accept a length param
+sub decode {
+ my($self, $bytes) = @_;
+
+ if ( !exists $self->{'decoded'} ) {
+ my $encoding = lc $self->header('content-transfer-encoding') || '';
+
+ if ( $encoding eq 'quoted-printable' ) {
+ dbg("decoding: quoted-printable");
+ $self->{'decoded'} = [
+ map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$self->{'raw'}} ) ) )
+ ];
+ }
+ elsif ( $encoding eq 'base64' ) {
+ dbg("decoding: base64");
+
+ # Generate the decoded output
+ $self->{'decoded'} = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$self->{'raw'}})) ];
+
+ # If it's a type text or message, split it into an array of lines
+ if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
+ $self->{'decoded'} = [ map { s/\r\n/\n/; $_; } split(/^/m, $self->{'decoded'}->[0]) ];
+ }
+ }
+ else {
+ # Encoding is one of 7bit, 8bit, binary or x-something
+ if ( $encoding ) {
+ dbg("decoding: other encoding type ($encoding), ignoring");
+ }
+ else {
+ dbg("decoding: no encoding detected");
+ }
+ $self->{'decoded'} = $self->{'raw'};
+ }
+ }
+
+ if ( !defined $bytes || $bytes ) {
+ my $tmp = join("", @{$self->{'decoded'}});
+ if ( !defined $bytes ) {
+ return $tmp;
+ }
+ else {
+ return substr($tmp, 0, $bytes);
+ }
+ }
+}
+
+=item _html_near_start()
+
+Look at a text scalar and determine whether it should be rendered
+as text/html. Based on a heuristic which simulates a certain common
+mail client.
+
+=cut
+
+sub _html_near_start {
+ my ($pad) = @_;
+
+ my $count = 0;
+ $count += ($pad =~ tr/\n//d) * 2;
+ $count += ($pad =~ tr/\n//cd);
+ return ($count < 24);
+}
+
+=item rendered()
+
+render_text() takes the given text/* type MIME part, and attempt
+to render it into a text scalar. It will always render text/html,
+and will use a heuristic to determine if other text/* parts should be
+considered text/html.
+
+=cut
+
+sub rendered {
+ my ($self) = @_;
+
+ # We don't render anything except text
+ return(undef,undef) unless ( $self->{'type'} =~ /^text\b/i );
+
+ if ( !exists $self->{'rendered'} ) {
+ my $text = $self->decode();
+
+ # render text/html always, or any other text part as text/html based
+ # on a heuristic which simulates a certain common mail client
+ if ( $self->{'type'} =~ m@^text/html\b@i ||
+ ($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
+ _html_near_start($1)
+ )
+ ) {
+ my $html = Mail::SpamAssassin::HTML->new(); # object
+ my $html_rendered = $html->html_render($text); # rendered text
+ my $html_results = $html->get_results(); # needed in eval tests
+
+ $self->{'rendered_type'} = 'text/html';
+ $self->{'rendered'} = join('', @{ $html_rendered });
+ }
+ else {
+ $self->{'rendered_type'} = $self->{'type'};
+ $self->{'rendered'} = $text;
+ }
+ }
+
+ return ($self->{'rendered_type'}, $self->{'rendered'});
+}
+
+# return an array with scalars describing mime parts
+sub content_summary {
+ my($self, $recurse) = @_;
+
+ # go recursive the first time through
+ $recurse = 1 unless ( defined $recurse );
+
+ # If this object matches, mark it for return.
+ if ( exists $self->{'body_parts'} ) {
+ my @ret = ();
+
+ # This object is a subtree root. Search all children.
+ foreach my $parts ( @{$self->{'body_parts'}} ) {
+ # Add the recursive results to our results
+ my @p = $parts->content_summary(0);
+ if ( $recurse ) {
+ push(@ret, join(",", @p));
+ }
+ else {
+ push(@ret, @p);
+ }
+ }
+
+ return($self->{'type'}, @ret);
+ }
+ else {
+ return $self->{'type'};
+ }
+}
+
sub dbg { Mail::SpamAssassin::dbg (@_); }
1;
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm Sat Jan 17 21:06:21 2004
@@ -72,9 +72,7 @@
if ($header) {
my ( $key, $value ) = split ( /:\s*/, $header, 2 );
- if ( $key =~ /^(?:MIME-Version|Lines|X-MIME|Content-)/i ) {
- $msg->header( $key, $self->_decode_header($value), $value );
- }
+ $msg->header( $key, $self->_decode_header($value), $value );
}
# not a continuation...
@@ -246,19 +244,18 @@
sub _parse_normal {
my ($self, $msg, $part_msg, $boundary, $body) = @_;
- dbg("parsing normal, decoding attachment");
- my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
+ dbg("parsing normal part");
+
+ $part_msg->{'type'} =
+ Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
+
+ # attempt to figure out a name for this attachment if there is one ...
+ my $disp = $part_msg->header('content-disposition') || '';
+ my($filename) = $disp =~ /name="?([^\";]+)"?/i || $part_msg->{'type'} =~ /name="?([^\";]+)"?/i;
- $part_msg->{'type'} = $type;
- $part_msg->{'decoded'} = $decoded;
$part_msg->{'raw'} = $body;
$part_msg->{'boundary'} = $boundary;
- $part_msg->{'name'} = $name if $name;
-
- # If the message is a text/* type, then try rendering it...
- if ( $type =~ /^text\b/i ) {
- ($part_msg->{'rendered'}, $part_msg->{'rendered_type'}) = _render_text($type, $decoded);
- }
+ $part_msg->{'name'} = $filename if $filename;
$msg->add_body_part($part_msg);
@@ -302,108 +299,6 @@
$header =~
s/=\?([\w_-]+)\?([bqBQ])\?(.*?)\?=/__decode_header($1, uc($2), $3)/ge;
return $header;
-}
-
-=item _decode()
-
-Decode base64 and quoted-printable parts.
-
-=cut
-
-sub _decode {
- my($self, $msg, $body ) = @_;
-
- my($type) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
- my ($filename) =
- ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
- if ( !$filename ) {
- ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
- }
-
- my $encoding = lc $msg->header('content-transfer-encoding') || '';
-
- if ( $encoding eq 'quoted-printable' ) {
- dbg("decoding: quoted-printable");
- my @output =
- map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
-
- return $type, \@output, $filename;
- }
- elsif ( $encoding eq 'base64' ) {
- dbg("decoding: base64");
-
- # Generate the decoded output
- my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
-
- # If it's a type text or message, split it into an array of lines
- $output = [ map { s/\r\n/\n/; $_; } split(/^/m, $output->[0]) ] if ( $type =~ m@^(?:text|message)/@ );
-
- return $type, $output, $filename;
- }
- else {
- # Encoding is one of 7bit, 8bit, binary or x-something
- if ( $encoding ) {
- dbg("decoding: other encoding type ($encoding), ignoring");
- }
- else {
- dbg("decoding: no encoding detected");
- }
-
- # No encoding, so just point to the raw data ...
- return $type, $body, $filename;
- }
-}
-
-=item _html_near_start()
-
-Look at a text scalar and determine whether it should be rendered
-as text/html. Based on a heuristic which simulates a certain common
-mail client.
-
-=cut
-
-sub _html_near_start {
- my ($pad) = @_;
-
- my $count = 0;
- $count += ($pad =~ tr/\n//d) * 2;
- $count += ($pad =~ tr/\n//cd);
- return ($count < 24);
-}
-
-=item _render_text()
-
-_render_text() takes the given text/* type MIME part, and attempt
-to render it into a text scalar. It will always render text/html,
-and will use a heuristic to determine if other text/* parts should be
-considered text/html.
-
-Pass in the content-type and the decoded part array. Returns a scalar
-with the rendered data and the "rendered_as" content-type (same as passed
-in for no rendering, "text/html" for HTML).
-
-=cut
-
-sub _render_text {
- my ($type, $decoded) = @_;
-
- my $text = join('', @{ $decoded });
-
- # render text/html always, or any other text part as text/html based
- # on a heuristic which simulates a certain common mail client
- if ( $type =~ m@^text/html\b@i ||
- ($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
- _html_near_start($1)
- )
- ) {
- my $html = Mail::SpamAssassin::HTML->new(); # object
- my $html_rendered = $html->html_render($text); # rendered text
- my $html_results = $html->get_results(); # needed in eval tests
- return ( join('', @{ $html_rendered }), 'text/html' );
- }
- else {
- return ( $text, $type );
- }
}
sub dbg { Mail::SpamAssassin::dbg (@_); }
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm Sat Jan 17 21:06:21 2004
@@ -608,20 +608,28 @@
my $ct = $_[-1] || 'text/plain; charset=us-ascii';
# This could be made a bit more rigid ...
- # the actual ABNF, BTW:
+ # the actual ABNF, BTW (RFC 1521, section 7.2.1):
# boundary := 0*69<bchars> bcharsnospace
# bchars := bcharsnospace / " "
# bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
# / "," / "-" / "." / "/" / ":" / "=" / "?"
#
- my ($boundary) = $ct =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+ # The boundary may be surrounded by double quotes.
+ # "the boundary parameter, which consists of 1 to 70 characters from
+ # a set of characters known to be very robust through email gateways,
+ # and NOT ending with white space. (If a boundary appears to end with
+ # white space, the white space must be presumed to have been added by
+ # a gateway, and must be deleted.)"
+ #
+ my ($boundary) = $ct =~ m!boundary\s*=\s*("[^"]*[^"\s]"|[^";\s]+)!i;
+ $boundary =~ tr/"//d if ( defined $boundary ); # remove the double quotes ...
# Get the type out ...
$ct =~ s/;.*$//; # strip everything after first semi-colon
$ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@; # only something/something ...
$ct =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d; # strip inappropriate chars
- return ($ct,$boundary);
+ return wantarray ? ($ct,$boundary) : $ct;
}
###########################################################################