You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/01/18 06:06:22 UTC

svn commit: rev 6197 - in incubator/spamassassin/trunk/lib/Mail/SpamAssassin: . MIME

Author: felicity
Date: Sat Jan 17 21:06:21 2004
New Revision: 6197

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
Log:
More parser work ...
- be a little smarter when parsing the boundary.  pay a little more attention to the rfc.
- do decode/render on-first-request.  this'll save time during parsing
  and memory until the need arises for a decoded/rendered part.
- generalized the text v. html difference rule to not know about internal data formats
- the parser will now deal with all of the message headers, not just mime-related ones
- decoding and rendering are now done in the object, not in the parser


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	Sat Jan 17 21:06:21 2004
@@ -3557,14 +3557,15 @@
   my($self) = @_;
 
   my @ma = $self->{msg}->{mime_parts}->find_parts(qr@^multipart/alternative\b@i);
+  my @content = $self->{msg}->{mime_parts}->content_summary();
 
   $self->{madiff} = 0;
 
   # Exchange meeting requests come in as m/a text/html text/calendar ...
   # Ignore any messages without a multipart/alternative section as well ...
-  if ( !@ma || (@ma == 1 && @{$ma[0]->{body_parts}} == 2 &&
-  		$ma[0]->{body_parts}->[0]->{type} =~ m@^text/html\b@i && 
-		$ma[0]->{body_parts}->[1]->{type} =~ m@^text/calendar\b@i) ) {
+  if ( !@ma || (@content == 3 && $content[2] eq 'text/calendar' &&
+  		$content[1] eq 'text/html' &&
+  		$content[0] eq 'multipart/alternative') ) {
     return;
   }
 
@@ -3575,9 +3576,9 @@
 
     my @txt = $part->find_parts(qr@^text\b@i);
     foreach my $text ( @txt ) {
-      my $rnd = $text->{'rendered'};
+      my($type, $rnd) = $text->rendered();
 
-      if ( $text->{'rendered_type'} =~ m@^text/html\b@i ) {
+      if ( $type eq 'text/html' ) {
         foreach my $w ( grep(/\w/,split(/\s+/,$rnd)) ) {
           next if ( $w =~ /^URI:/ );
           $html{$w}++;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME.pm	Sat Jan 17 21:06:21 2004
@@ -174,6 +174,147 @@
   push @{ $self->{'body_parts'} }, $part;
 }
 
+=item _decode()
+
+Decode base64 and quoted-printable parts.
+
+=cut
+
+# TODO: accept a length param
+sub decode {
+  my($self, $bytes) = @_;
+
+  if ( !exists $self->{'decoded'} ) {
+    my $encoding = lc $self->header('content-transfer-encoding') || '';
+
+    if ( $encoding eq 'quoted-printable' ) {
+      dbg("decoding: quoted-printable");
+      $self->{'decoded'} = [
+        map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$self->{'raw'}} ) ) )
+	];
+    }
+    elsif ( $encoding eq 'base64' ) {
+      dbg("decoding: base64");
+
+      # Generate the decoded output
+      $self->{'decoded'} = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$self->{'raw'}})) ];
+
+      # If it's a type text or message, split it into an array of lines
+      if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
+        $self->{'decoded'} = [ map { s/\r\n/\n/; $_; } split(/^/m, $self->{'decoded'}->[0]) ];
+      }
+    }
+    else {
+      # Encoding is one of 7bit, 8bit, binary or x-something
+      if ( $encoding ) {
+        dbg("decoding: other encoding type ($encoding), ignoring");
+      }
+      else {
+        dbg("decoding: no encoding detected");
+      }
+      $self->{'decoded'} = $self->{'raw'};
+    }
+  }
+
+  if ( !defined $bytes || $bytes ) {
+    my $tmp = join("", @{$self->{'decoded'}});
+    if ( !defined $bytes ) {
+      return $tmp;
+    }
+    else {
+      return substr($tmp, 0, $bytes);
+    }
+  }
+}
+
+=item _html_near_start()
+
+Look at a text scalar and determine whether it should be rendered
+as text/html.  Based on a heuristic which simulates a certain common
+mail client.
+
+=cut
+
+sub _html_near_start {
+  my ($pad) = @_;
+
+  my $count = 0;
+  $count += ($pad =~ tr/\n//d) * 2;
+  $count += ($pad =~ tr/\n//cd);
+  return ($count < 24);
+}
+
+=item rendered()
+
+render_text() takes the given text/* type MIME part, and attempt
+to render it into a text scalar.  It will always render text/html,
+and will use a heuristic to determine if other text/* parts should be
+considered text/html.
+
+=cut
+
+sub rendered {
+  my ($self) = @_;
+
+  # We don't render anything except text
+  return(undef,undef) unless ( $self->{'type'} =~ /^text\b/i );
+
+  if ( !exists $self->{'rendered'} ) {
+    my $text = $self->decode();
+
+    # render text/html always, or any other text part as text/html based
+    # on a heuristic which simulates a certain common mail client
+    if ( $self->{'type'} =~ m@^text/html\b@i ||
+        ($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
+	  _html_near_start($1)
+        )
+       ) {
+      my $html = Mail::SpamAssassin::HTML->new();		# object
+      my $html_rendered = $html->html_render($text);	# rendered text
+      my $html_results = $html->get_results();		# needed in eval tests
+    
+      $self->{'rendered_type'} = 'text/html';
+      $self->{'rendered'} = join('', @{ $html_rendered });
+    }
+    else {
+      $self->{'rendered_type'} = $self->{'type'};
+      $self->{'rendered'} = $text;
+    }
+  }
+
+  return ($self->{'rendered_type'}, $self->{'rendered'});
+}
+
+# return an array with scalars describing mime parts
+sub content_summary {
+  my($self, $recurse) = @_;
+
+  # go recursive the first time through
+  $recurse = 1 unless ( defined $recurse );
+
+  # If this object matches, mark it for return.
+  if ( exists $self->{'body_parts'} ) {
+    my @ret = ();
+
+    # This object is a subtree root.  Search all children.
+    foreach my $parts ( @{$self->{'body_parts'}} ) {
+      # Add the recursive results to our results
+      my @p = $parts->content_summary(0);
+      if ( $recurse ) {
+        push(@ret, join(",", @p));
+      }
+      else {
+        push(@ret, @p);
+      }
+    }
+
+    return($self->{'type'}, @ret);
+  }
+  else {
+    return $self->{'type'};
+  }
+}
+
 sub dbg { Mail::SpamAssassin::dbg (@_); }
 
 1;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MIME/Parser.pm	Sat Jan 17 21:06:21 2004
@@ -72,9 +72,7 @@
 
     if ($header) {
       my ( $key, $value ) = split ( /:\s*/, $header, 2 );
-      if ( $key =~ /^(?:MIME-Version|Lines|X-MIME|Content-)/i ) {
-        $msg->header( $key, $self->_decode_header($value), $value );
-      }
+      $msg->header( $key, $self->_decode_header($value), $value );
     }
 
     # not a continuation...
@@ -246,19 +244,18 @@
 sub _parse_normal {
   my ($self, $msg, $part_msg, $boundary, $body) = @_;
 
-  dbg("parsing normal, decoding attachment");
-  my ($type, $decoded, $name) = $self->_decode($part_msg, $body);
+  dbg("parsing normal part");
+
+  $part_msg->{'type'} =
+    Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
+
+  # attempt to figure out a name for this attachment if there is one ...
+  my $disp = $part_msg->header('content-disposition') || '';
+  my($filename) = $disp =~ /name="?([^\";]+)"?/i || $part_msg->{'type'} =~ /name="?([^\";]+)"?/i;
 
-  $part_msg->{'type'} = $type;
-  $part_msg->{'decoded'} = $decoded;
   $part_msg->{'raw'} = $body;
   $part_msg->{'boundary'} = $boundary;
-  $part_msg->{'name'} = $name if $name;
-
-  # If the message is a text/* type, then try rendering it...
-  if ( $type =~ /^text\b/i ) {
-    ($part_msg->{'rendered'}, $part_msg->{'rendered_type'}) = _render_text($type, $decoded);
-  }
+  $part_msg->{'name'} = $filename if $filename;
 
   $msg->add_body_part($part_msg);
 
@@ -302,108 +299,6 @@
   $header =~
     s/=\?([\w_-]+)\?([bqBQ])\?(.*?)\?=/__decode_header($1, uc($2), $3)/ge;
   return $header;
-}
-
-=item _decode()
-
-Decode base64 and quoted-printable parts.
-
-=cut
-
-sub _decode {
-  my($self, $msg, $body ) = @_;
-
-  my($type) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
-  my ($filename) =
-    ( $msg->header('content-disposition') =~ /name="?([^\";]+)"?/i );
-  if ( !$filename ) {
-    ($filename) = ( $type =~ /name="?([^\";]+)"?/i );
-  }
-
-  my $encoding = lc $msg->header('content-transfer-encoding') || '';
-
-  if ( $encoding eq 'quoted-printable' ) {
-    dbg("decoding: quoted-printable");
-    my @output =
-      map { s/\r\n/\n/; $_; } split ( /^/m, Mail::SpamAssassin::Util::qp_decode( join ( "", @{$body} ) ) );
-
-    return $type, \@output, $filename;
-  }
-  elsif ( $encoding eq 'base64' ) {
-    dbg("decoding: base64");
-
-    # Generate the decoded output
-    my $output = [ Mail::SpamAssassin::Util::base64_decode(join("", @{$body})) ];
-
-    # If it's a type text or message, split it into an array of lines
-    $output = [ map { s/\r\n/\n/; $_; } split(/^/m, $output->[0]) ] if ( $type =~ m@^(?:text|message)/@ );
-
-    return $type, $output, $filename;
-  }
-  else {
-    # Encoding is one of 7bit, 8bit, binary or x-something
-    if ( $encoding ) {
-      dbg("decoding: other encoding type ($encoding), ignoring");
-    }
-    else {
-      dbg("decoding: no encoding detected");
-    }
-
-    # No encoding, so just point to the raw data ...
-    return $type, $body, $filename;
-  }
-}
-
-=item _html_near_start()
-
-Look at a text scalar and determine whether it should be rendered
-as text/html.  Based on a heuristic which simulates a certain common
-mail client.
-
-=cut
-
-sub _html_near_start {
-  my ($pad) = @_;
-
-  my $count = 0;
-  $count += ($pad =~ tr/\n//d) * 2;
-  $count += ($pad =~ tr/\n//cd);
-  return ($count < 24);
-}
-
-=item _render_text()
-
-_render_text() takes the given text/* type MIME part, and attempt
-to render it into a text scalar.  It will always render text/html,
-and will use a heuristic to determine if other text/* parts should be
-considered text/html.
-
-Pass in the content-type and the decoded part array.  Returns a scalar
-with the rendered data and the "rendered_as" content-type (same as passed
-in for no rendering, "text/html" for HTML).
-
-=cut
-
-sub _render_text {
-  my ($type, $decoded) = @_;
-
-  my $text = join('', @{ $decoded });
-
-  # render text/html always, or any other text part as text/html based
-  # on a heuristic which simulates a certain common mail client
-  if ( $type =~ m@^text/html\b@i ||
-      ($text =~ m/^(.{0,18}?<(?:$Mail::SpamAssassin::HTML::re_start)(?:\s.{0,18}?)?>)/ois &&
-	_html_near_start($1)
-      )
-     ) {
-    my $html = Mail::SpamAssassin::HTML->new();		# object
-    my $html_rendered = $html->html_render($text);	# rendered text
-    my $html_results = $html->get_results();		# needed in eval tests
-    return ( join('', @{ $html_rendered }), 'text/html' );
-  }
-  else {
-    return ( $text, $type );
-  }
 }
 
 sub dbg { Mail::SpamAssassin::dbg (@_); }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Util.pm	Sat Jan 17 21:06:21 2004
@@ -608,20 +608,28 @@
   my $ct = $_[-1] || 'text/plain; charset=us-ascii';
 
   # This could be made a bit more rigid ...
-  # the actual ABNF, BTW:
+  # the actual ABNF, BTW (RFC 1521, section 7.2.1):
   # boundary := 0*69<bchars> bcharsnospace
   # bchars := bcharsnospace / " "
   # bcharsnospace :=    DIGIT / ALPHA / "'" / "(" / ")" / "+" /"_"
   #               / "," / "-" / "." / "/" / ":" / "=" / "?"
   #
-  my ($boundary) = $ct =~ /boundary\s*=\s*["']?([^"';]+)["']?/i;
+  # The boundary may be surrounded by double quotes.
+  # "the boundary parameter, which consists of 1 to 70 characters from
+  # a set of characters known to be very robust through email gateways,
+  # and NOT ending with white space.  (If a boundary appears to end with
+  # white space, the white space must be presumed to have been added by
+  # a gateway, and must be deleted.)"
+  #
+  my ($boundary) = $ct =~ m!boundary\s*=\s*("[^"]*[^"\s]"|[^";\s]+)!i;
+  $boundary =~ tr/"//d if ( defined $boundary ); # remove the double quotes ...
 
   # Get the type out ...
   $ct =~ s/;.*$//;                    # strip everything after first semi-colon
   $ct =~ s@^([^/]+(?:/[^/]*)?).*$@$1@;	# only something/something ...
   $ct =~ tr!\000-\040\177-\377\042\050\051\054\056\072-\077\100\133-\135!!d;    # strip inappropriate chars
 
-  return ($ct,$boundary);
+  return wantarray ? ($ct,$boundary) : $ct;
 }
 
 ###########################################################################