You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by fe...@apache.org on 2004/02/18 03:41:35 UTC

svn commit: rev 6713 - in incubator/spamassassin/trunk: . lib/Mail lib/Mail/SpamAssassin masses spamd t

Author: felicity
Date: Tue Feb 17 18:41:34 2004
New Revision: 6713

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
   incubator/spamassassin/trunk/masses/mass-check
   incubator/spamassassin/trunk/spamassassin.raw
   incubator/spamassassin/trunk/spamd/spamd.raw
   incubator/spamassassin/trunk/t/bayesdbm.t
   incubator/spamassassin/trunk/t/bayessql.t
   incubator/spamassassin/trunk/t/mimeparse.t
   incubator/spamassassin/trunk/t/rule_tests.t
Log:
API CHANGE!  Moving from MsgParser->parse() to M::SA->parse(), so future changes to how we parse do not mean a change in the backend. Everything else is the same for now.

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	Tue Feb 17 18:41:34 2004
@@ -20,9 +20,8 @@
 
 =head1 SYNOPSIS
 
-  my $mail = Mail::SpamAssassin::MsgParser->parse();
-
   my $spamtest = Mail::SpamAssassin->new();
+  my $mail = $spamtest->parse();
   my $status = $spamtest->check ($mail);
 
   if ($status->is_spam ()) {
@@ -301,6 +300,100 @@
 
 ###########################################################################
 
+=item parse()
+
+Parse will return a Mail::SpamAssassin::MsgContainer object.  To use it,
+simply call C<< Mail::SpamAssassin->parse($msg) >>, where $msg is either undef
+(will use STDIN), a scalar of the entire message, an array reference
+of the message with 1 line per array element, or a file glob with the
+entire contents of the message.
+
+The procedure used to parse a message is recursive and ends up
+generating a tree of M::SA::MsgContainer objects.  parse() will generate
+the parent node of the tree, then pass the body of the message to
+M::SA::MsgParser->parse_body() which begins the recursive process.
+
+=cut
+
+sub parse {
+  my($self, $message) = @_;
+  $message ||= \*STDIN;   # any false value (undef, '', 0) means: read from STDIN
+
+  dbg("---- MIME PARSER START ----");
+
+  # localize $_ so our use of it (the CRLF loop below) stays inside this sub
+  local $_;
+
+  my @message;
+  if (ref $message eq 'ARRAY') {   # array ref: copy the lines
+     @message = @{$message};
+  }
+  elsif (ref $message eq 'GLOB') {   # glob: read all lines from the handle
+    if (defined fileno $message) {   # ... unless it is not open; then @message stays empty
+      @message = <$message>;
+    }
+  }
+  else {   # anything else is treated as one string and split into lines
+    @message = split ( /^/m, $message );
+  }
+
+  # Create the root container; the headers parsed below are stored on it.
+  my $msg = Mail::SpamAssassin::MsgContainer->new();
+  my $header = '';
+  $msg->{'pristine_headers'} = '';
+
+  # Consume header lines from the front of @message.
+  while ( my $last = shift @message ) {   # NOTE(review): also stops on any false line, e.g. a final "0" without newline
+    # Keep an unmodified copy of every line consumed here, separator lines included
+    $msg->{'pristine_headers'} .= $last;
+
+    if ( $last =~ /^From\s/ ) {   # mbox "From " line: remembered, not stored as a header
+      $msg->{'mbox_sep'} = $last;
+      next;
+    }
+
+    # NB: Really need to figure out special folding rules here!
+    if ( $last =~ /^[ \t]+/ ) {                    # leading whitespace: a continuation line
+      $header .= $last;                            # append it to the header being collected
+      next;
+    }
+
+    # A new line has started, so the header collected so far (if any) is complete: store it.
+    if ($header) {
+      my ( $key, $value ) = split ( /:\s*/, $header, 2 );   # limit 2: split at the first colon only
+      $msg->header( $key, $value );
+    }
+
+    # Start collecting the next header with the current line
+    $header = $last;
+
+    # A blank line (optionally just a CR) separates headers from body: stop here.
+    last if ( $last =~ /^\r?$/m );
+  }   # NOTE(review): if input ends without a blank line, the last collected header is never stored
+
+  # Snapshot the raw body now as a joined copy, since @message is modified in place below
+  $msg->{'pristine_body'} = join('', @message);
+
+  # CRLF -> LF on each remaining (body) line
+  for ( @message ) {
+    s/\r\n/\n/;
+  }
+
+  # Take the top-level content type and MIME boundary from the Content-Type header
+  my ($boundary);
+  ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
+  dbg("main message type: ".$msg->{'type'});
+
+  # Build the tree under $msg; the trailing 1 is parse_body()'s $initial argument
+  Mail::SpamAssassin::MsgParser->parse_body( $msg, $msg, $boundary, \@message, 1 );
+
+  dbg("---- MIME PARSER END ----");
+
+  return $msg;
+}
+
+###########################################################################
+
 =item $f->trim_rules ($regexp)
 
 Remove all rules that don't match the given regexp (or are sub-rules of
@@ -600,8 +693,7 @@
 
 sub check_message_text {
   my $self = shift;
-  my @lines = split (/^/m, $_[0]);
-  my $mail_obj = Mail::SpamAssassin::MsgParser->parse (\@lines);
+  my $mail_obj = $self->parse (shift);
   return $self->check ($mail_obj);
 }
 
@@ -646,8 +738,7 @@
   $self->init(1);
 
   # Let's make sure the markup was removed first ...
-  my @msg = split (/^/m, $self->remove_spamassassin_markup($mail));
-  $mail = Mail::SpamAssassin::MsgParser->parse (\@msg);
+  $mail = $self->parse ($self->remove_spamassassin_markup($mail));
 
   # learn as spam if enabled
   if ( $self->{conf}->{bayes_learn_during_report} ) {
@@ -690,8 +781,7 @@
   $self->init(1);
 
   # Let's make sure the markup was removed first ...
-  my @msg = split (/^/m, $self->remove_spamassassin_markup($mail));
-  $mail = Mail::SpamAssassin::MsgParser->parse (\@msg);
+  $mail = $self->parse ($self->remove_spamassassin_markup($mail));
 
   # learn as nonspam
   $self->learn ($mail, undef, 0, 0);
@@ -1106,7 +1196,7 @@
   dbg ("ignore: test message to precompile patterns and load modules");
   $self->init($use_user_prefs);
 
-  my $mail = Mail::SpamAssassin::MsgParser->parse(\@testmsg);
+  my $mail = $self->parse(\@testmsg);
   my $status = Mail::SpamAssassin::PerMsgStatus->new($self, $mail,
                         { disable_auto_learning => 1 } );
   $status->word_is_in_dictionary("aba"); # load triplets.txt into memory
@@ -1153,7 +1243,7 @@
   $self->init(1);
   $self->{syntax_errors} += $self->{conf}->{errors};
 
-  my $mail = Mail::SpamAssassin::MsgParser->parse(\@testmsg);
+  my $mail = $self->parse(\@testmsg);
   my $status = Mail::SpamAssassin::PerMsgStatus->new($self, $mail,
                         { disable_auto_learning => 1 } );
   $status->check();

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/CmdLearn.pm	Tue Feb 17 18:41:34 2004
@@ -21,7 +21,7 @@
 
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::ArchiveIterator;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 use Mail::SpamAssassin::PerMsgLearner;
 
 use Getopt::Long;
@@ -317,13 +317,13 @@
 					{ die 'HITLIMIT'; }
 
   $messagecount++;
-  my $ma = Mail::SpamAssassin::MsgParser->parse ($dataref);
+  my $ma = Mail::SpamAssassin->parse ($dataref);
 
   if ($ma->get_header ("X-Spam-Checker-Version")) {
     my $newtext = $spamtest->remove_spamassassin_markup($ma);
     my @newtext = split (/^/m, $newtext);
     $dataref = \@newtext;
-    $ma = Mail::SpamAssassin::MsgParser->parse ($dataref);
+    $ma = Mail::SpamAssassin->parse ($dataref);
   }
 
   my $status = $spamtest->learn ($ma, undef, $isspam, $forget);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgParser.pm	Tue Feb 17 18:41:34 2004
@@ -43,117 +43,22 @@
 
 use constant MAX_BODY_LINE_LENGTH =>        2048;
 
-=item parse()
+=item parse_body()
 
-Unlike most modules, Mail::SpamAssassin::MsgParser will not return
-an object of the same type, but rather a Mail::SpamAssassin::MsgContainer
-object.  To use it, simply call
-C<Mail::SpamAssassin::MsgParser->parse($msg)>, where $msg is either
-a scalar, an array reference, or a glob, with the entire contents
-of the mesage.
-
-The procedure used to parse a message is recursive and ends up generating
-a tree of M::SA::MsgContainer objects.  parse() will generate the parent node
-of the tree, then pass the body of the message to _parse_body() which begins
-the recursive process.
-
-=cut
-
-sub parse {
-  my($self,$message) = @_;
-  $message ||= \*STDIN;
-
-  dbg("---- MIME PARSER START ----");
-
-  # protect it from abuse ...
-  local $_;
-
-  my @message;
-  if (ref $message eq 'ARRAY') {
-     @message = @{$message};
-  }
-  elsif (ref $message eq 'GLOB') {
-    if (defined fileno $message) {
-      @message = <$message>;
-    }
-  }
-  else {
-    @message = split ( /^/m, $message );
-  }
-
-  # Generate the main object and parse the appropriate MIME-related headers into it.
-  my $msg = Mail::SpamAssassin::MsgContainer->new();
-  my $header = '';
-  $msg->{'pristine_headers'} = '';
-
-  # Go through all the headers of the message
-  while ( my $last = shift @message ) {
-    # Store the non-modified headers in a scalar
-    $msg->{'pristine_headers'} .= $last;
-
-    if ( $last =~ /^From\s/ ) {
-      $msg->{'mbox_sep'} = $last;
-      next;
-    }
-
-    # NB: Really need to figure out special folding rules here!
-    if ( $last =~ /^[ \t]+/ ) {                    # if its a continuation
-      $header .= $last;                            # fold continuations
-      next;
-    }
-
-    # Ok, there's a header here, let's go ahead and add it in.
-    if ($header) {
-      my ( $key, $value ) = split ( /:\s*/, $header, 2 );
-      $msg->header( $key, $value );
-    }
-
-    # not a continuation...
-    $header = $last;
-
-    # Ok, we found the header/body blank line ...
-    last if ( $last =~ /^\r?$/m );
-  }
-
-  # Store the pristine body for later -- store as a copy since @message will get modified below
-  $msg->{'pristine_body'} = join('', @message);
-
-  # CRLF -> LF
-  for ( @message ) {
-    s/\r\n/\n/;
-  }
-
-  # Figure out the boundary
-  my ($boundary);
-  ($msg->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
-  dbg("main message type: ".$msg->{'type'});
-
-  # Make the tree
-  $self->_parse_body( $msg, $msg, $boundary, \@message, 1 );
-
-  dbg("---- MIME PARSER END ----");
-
-  return $msg;
-}
-
-=head1 NON-PUBLIC METHODS
-
-=item _parse_body()
-
-_parse_body() passes the body part that was passed in onto the
+parse_body() passes the body part that was passed in onto the
 correct part parser, either _parse_multipart() for multipart/* parts,
 or _parse_normal() for everything else.  Multipart sections become the
 root of sub-trees, while everything else becomes a leaf in the tree.
 
-For multipart messages, the first call to _parse_body() doesn't create a
+For multipart messages, the first call to parse_body() doesn't create a
 new sub-tree and just uses the parent node to contain children.  All other
-calls to _parse_body() will cause a new sub-tree root to be created and
+calls to parse_body() will cause a new sub-tree root to be created and
 children will exist underneath that root.  (this is just so the tree
 doesn't have a root node which points at the actual root node ...)
 
 =cut
 
-sub _parse_body {
+sub parse_body {
   my($self, $msg, $_msg, $boundary, $body, $initial) = @_;
 
   # Figure out the simple content-type, or set it to text/plain
@@ -182,9 +87,11 @@
   }
 }
 
+=head1 NON-PUBLIC METHODS
+
 =item _parse_multipart()
 
-Generate a root node, and for each child part call _parse_body()
+Generate a root node, and for each child part call parse_body()
 to generate the tree.
 
 =cut
@@ -237,7 +144,7 @@
 	($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
         $p_boundary ||= $boundary;
 	dbg("found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
-        $self->_parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
+        $self->parse_body( $msg, $part_msg, $p_boundary, $part_array, 0 );
       }
 
       last if (defined $boundary && $line =~ /^\-\-\Q${boundary}\E\-\-$/);

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgLearner.pm	Tue Feb 17 18:41:34 2004
@@ -24,7 +24,7 @@
     'rules_filename'      => '/etc/spamassassin.rules',
     'userprefs_filename'  => $ENV{HOME}.'/.spamassassin.cf'
   });
-  my $mail = Mail::SpamAssassin::MsgParser->parse();
+  my $mail = $spamtest->parse();
 
   my $status = $spamtest->learn ($mail);
   ...

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	Tue Feb 17 18:41:34 2004
@@ -24,7 +24,7 @@
     'rules_filename'      => '/etc/spamassassin.rules',
     'userprefs_filename'  => $ENV{HOME}.'/.spamassassin.cf'
   });
-  my $mail = Mail::SpamAssassin::MsgParser->parse();
+  my $mail = $spamtest->parse();
 
   my $status = $spamtest->check ($mail);
   if ($status->is_spam()) {
@@ -57,7 +57,7 @@
 use Mail::SpamAssassin::Conf;
 use Mail::SpamAssassin::Received;
 use Mail::SpamAssassin::Util;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 
 use constant MAX_BODY_LINE_LENGTH =>        2048;
 

Modified: incubator/spamassassin/trunk/masses/mass-check
==============================================================================
--- incubator/spamassassin/trunk/masses/mass-check	(original)
+++ incubator/spamassassin/trunk/masses/mass-check	Tue Feb 17 18:41:34 2004
@@ -77,7 +77,7 @@
 eval "use bytes";
 use Mail::SpamAssassin::ArchiveIterator;
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 use Getopt::Long;
 use POSIX qw(strftime);
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
@@ -242,12 +242,12 @@
   my ($id, $time, $dataref) = @_;
   my $out;
 
-  my $ma = Mail::SpamAssassin::MsgParser->parse($dataref);
+  my $ma = $spamtest->parse($dataref);
 
   # remove SpamAssassin markup, if present and the mail was spam
   $_ = $ma->get_header ("X-Spam-Status");
   if (defined($_) && /^Yes, hits=/) {
-    $ma = Mail::SpamAssassin::MsgParser->parse ($spamtest->remove_spamassassin_markup($ma));
+    $ma = $spamtest->parse ($spamtest->remove_spamassassin_markup($ma));
   }
 
   my $status = $spamtest->check($ma);

Modified: incubator/spamassassin/trunk/spamassassin.raw
==============================================================================
--- incubator/spamassassin/trunk/spamassassin.raw	(original)
+++ incubator/spamassassin/trunk/spamassassin.raw	Tue Feb 17 18:41:34 2004
@@ -80,7 +80,7 @@
 
 eval {
   require Mail::SpamAssassin;
-  require Mail::SpamAssassin::MsgParser;
+  require Mail::SpamAssassin::MsgContainer;
 
   # gnu_getopt is not available in Getopt::Long 2.24, see bug 732
   # gnu_compat neither.
@@ -168,7 +168,7 @@
   # incoming message
   #
   if (!$doing_address_only_whitelisting) {
-    $mail = Mail::SpamAssassin::MsgParser->parse ();
+    $mail = $spamtest->parse ();
   }
 
 # handle removing reports
@@ -183,7 +183,7 @@
       # go ahead and remove the markup, then fake that the clean version
       # was what was sent in
       #
-      $mail = Mail::SpamAssassin::MsgParser->parse ($spamtest->remove_spamassassin_markup ($mail));
+      $mail = $spamtest->parse ($spamtest->remove_spamassassin_markup ($mail));
     }
   }
 

Modified: incubator/spamassassin/trunk/spamd/spamd.raw
==============================================================================
--- incubator/spamassassin/trunk/spamd/spamd.raw	(original)
+++ incubator/spamassassin/trunk/spamd/spamd.raw	Tue Feb 17 18:41:34 2004
@@ -38,7 +38,7 @@
 use IO::Pipe;
 
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 use Mail::SpamAssassin::NetSet;
 
 use Getopt::Long;
@@ -888,7 +888,7 @@
       "."
     );
 
-    my $mail = Mail::SpamAssassin::MsgParser->parse (\@msglines);
+    my $mail = $spamtest->parse (\@msglines);
 
     # Check length if we're supposed to
     if($expected_length && ($actual_length != $expected_length)) {

Modified: incubator/spamassassin/trunk/t/bayesdbm.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayesdbm.t	(original)
+++ incubator/spamassassin/trunk/t/bayesdbm.t	Tue Feb 17 18:41:34 2004
@@ -26,7 +26,7 @@
 ");
 
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 
 my $sa = create_saobj();
 
@@ -54,7 +54,7 @@
   push(@msg, $line);
 }
 
-my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+my $mail = $sa->parse( \@msg );
 
 ok($mail);
 
@@ -215,7 +215,7 @@
   push(@msg, $line);
 }
 
-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );
 
 $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
 
@@ -246,7 +246,7 @@
   push(@msg, $line);
 }
 
-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );
 
 $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
 

Modified: incubator/spamassassin/trunk/t/bayessql.t
==============================================================================
--- incubator/spamassassin/trunk/t/bayessql.t	(original)
+++ incubator/spamassassin/trunk/t/bayessql.t	Tue Feb 17 18:41:34 2004
@@ -59,7 +59,7 @@
 ");
 
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 
 my $sa = create_saobj();
 
@@ -87,7 +87,7 @@
   push(@msg, $line);
 }
 
-my $mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+my $mail = $sa->parse( \@msg );
 
 ok($mail);
 
@@ -220,7 +220,7 @@
   push(@msg, $line);
 }
 
-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );
 
 $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
 
@@ -250,7 +250,7 @@
   push(@msg, $line);
 }
 
-$mail = Mail::SpamAssassin::MsgParser->parse( \@msg );
+$mail = $sa->parse( \@msg );
 
 $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
 

Modified: incubator/spamassassin/trunk/t/mimeparse.t
==============================================================================
--- incubator/spamassassin/trunk/t/mimeparse.t	(original)
+++ incubator/spamassassin/trunk/t/mimeparse.t	Tue Feb 17 18:41:34 2004
@@ -17,7 +17,7 @@
 
 use strict;
 use Test;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 use Mail::SpamAssassin::SHA1;
 
 my %files = (
@@ -88,7 +88,7 @@
 
 foreach my $k ( sort keys %files ) {
   open(INP, $k) || die "Can't find $k:$!";
-  my $mail = Mail::SpamAssassin::MsgParser->parse(\*INP);
+  my $mail = Mail::SpamAssassin->parse(\*INP);
   close(INP);
   my $res = join("\n",$mail->content_summary());
   #print "---\n$res\n---\n";

Modified: incubator/spamassassin/trunk/t/rule_tests.t
==============================================================================
--- incubator/spamassassin/trunk/t/rule_tests.t	(original)
+++ incubator/spamassassin/trunk/t/rule_tests.t	Tue Feb 17 18:41:34 2004
@@ -18,7 +18,7 @@
 use strict;
 use Test;
 use Mail::SpamAssassin;
-use Mail::SpamAssassin::MsgParser;
+use Mail::SpamAssassin::MsgContainer;
 use vars qw($num_tests);
 
 $num_tests = 1;
@@ -62,7 +62,7 @@
             my $test_string = $sa->{conf}->{head_tests}->{$symbol} || $sa->{conf}->{head_evals}->{$symbol};
             my ($header_name) = $test_string =~ /^(\S+)/;
             # warn("got header name: $header_name - setting to: $string\n");
-	    $mail = Mail::SpamAssassin::MsgParser->parse(["${header_name}: $string\n","\n","\n"]);
+	    $mail = $sa->parse(["${header_name}: $string\n","\n","\n"]);
         }
         else {
             # warn("setting body: $string\n");
@@ -73,7 +73,7 @@
 	    if ( $string =~ /<[^>]*>/ ) {
 	      $type = "text/html";
 	    }
-	    $mail = Mail::SpamAssassin::MsgParser->parse(["Content-type: $type\n","\n","$string\n"]);
+	    $mail = $sa->parse(["Content-type: $type\n","\n","$string\n"]);
         }
 
         my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);