You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/02/19 19:41:48 UTC

svn commit: rev 6779 - in incubator/spamassassin/trunk/lib/Mail: . SpamAssassin

Author: jm
Date: Thu Feb 19 10:41:47 2004
New Revision: 6779

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
some cleanup of last night's metadata code

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin.pm	Thu Feb 19 10:41:47 2004
@@ -348,6 +348,11 @@
   my $header = '';
   $msg->{'pristine_headers'} = '';
 
+  # inform the node that it's a message root, so that it knows that
+  # it can have stuff that only root nodes have.  TODO: IMO, we should
+  # probably just have a subclass of MsgContainer for root nodes!
+  $msg->_set_is_root();
+
   # Go through all the headers of the message
   while ( my $last = shift @message ) {
     # Store the non-modified headers in a scalar

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/EvalTests.pm	Thu Feb 19 10:41:47 2004
@@ -2268,17 +2268,18 @@
   }
 
   # map of languages that are very often mistaken for another, perhaps with
-  # more than 0.02% false positives
+  # more than 0.02% false positives.  The mapping is only applied when the
+  # text is shorter than 2048 bytes
   my %mistakable = ('sco' => 'en');
 
   # see if any matches are okay
   foreach my $match (@matches) {
     $match =~ s/\..*//;
-    if (exists $mistakable{$match}) {
+    if ($self->{languages_body_len} < 2048 && exists $mistakable{$match}) {
       $match = $mistakable{$match};
     }
     foreach my $language (@languages) {
-      if (exists $mistakable{$language}) {
+      if ($self->{languages_body_len} < 2048 && exists $mistakable{$language}) {
 	$language = $mistakable{$language};
       }
       if ($match eq $language) {

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgContainer.pm	Thu Feb 19 10:41:47 2004
@@ -56,7 +56,6 @@
   my $self = {
     headers		=> {},
     raw_headers		=> {},
-    meta_strings	=> {},
     body_parts		=> [],
     header_order	=> [],
     already_parsed	=> 1,
@@ -72,6 +71,23 @@
   $self;
 }
 
+=item _set_is_root()
+
+Non-Public function to inform this node that it is the message root, and
+can therefore hold data that only a root node should hold.
+
+(TODO: IMO, we should just have a subclass of MsgContainer for
+root nodes.)
+
+=cut
+
+sub _set_is_root {
+  my($self) = @_;
+
+  # create the metadata holder class
+  $self->{metadata} = Mail::SpamAssassin::MsgMetadata->new($self);
+}
+
 =item _do_parse()
 
 Non-Public function which will initiate a MIME part part (generates
@@ -589,11 +605,10 @@
 sub extract_message_metadata {
   my ($self, $main) = @_;
 
-  # do this only once
+  # do this only once per message, it can be expensive
   if ($self->{already_extracted_metadata}) { return; }
   $self->{already_extracted_metadata} = 1;
 
-  $self->{metadata} = Mail::SpamAssassin::MsgMetadata->new($self);
   $self->{metadata}->extract ($self, $main);
 }
 
@@ -605,7 +620,7 @@
 
 sub get_metadata {
   my ($self, $hdr) = @_;
-  $self->{meta_strings}->{$hdr};
+  $self->{metadata}->{strings}->{$hdr};
 }
 
 =item put_metadata($hdr, $text)
@@ -614,7 +629,7 @@
 
 sub put_metadata {
   my ($self, $hdr, $text) = @_;
-  $self->{meta_strings}->{$hdr} = $text;
+  $self->{metadata}->{strings}->{$hdr} = $text;
 }
 
 =item delete_metadata($hdr)
@@ -623,7 +638,7 @@
 
 sub delete_metadata {
   my ($self, $hdr) = @_;
-  delete $self->{meta_strings}->{$hdr};
+  delete $self->{metadata}->{strings}->{$hdr};
 }
 
 =item $str = get_all_metadata()
@@ -634,13 +649,27 @@
   my ($self) = @_;
 
   my @ret = ();
-  foreach my $key (sort keys %{$self->{meta_strings}}) {
-    push (@ret, $key, ": ", $self->{meta_strings}->{$key}, "\n");
+  foreach my $key (sort keys %{$self->{metadata}->{strings}}) {
+    push (@ret, $key, ": ", $self->{metadata}->{strings}->{$key}, "\n");
   }
   return join ("", @ret);
 }
 
 # ---------------------------------------------------------------------------
+
+=item finish_metadata()
+
+Destroys the metadata for this message.  Once a message has been
+scanned fully, the metadata is no longer required, and destroying
+it frees up some memory.
+
+=cut
+
+sub finish_metadata {
+  my ($self) = @_;
+  $self->{metadata}->finish();
+  delete $self->{metadata};
+}
 
 =item finish()
 

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/MsgMetadata.pm	Thu Feb 19 10:41:47 2004
@@ -24,7 +24,22 @@
 
 =head1 DESCRIPTION
 
-This module will extract metadata from an email message.
+This class is tasked with extracting "metadata" from messages for use as
+Bayes tokens, fodder for eval tests, or other rules.  Metadata is
+supplemental data inferred from the message, like the examples below.
+
+It is held in two forms:
+
+1. as name-value pairs of strings, presented in mail header format.  For
+  example, "X-Language" => "en".  This is the general form for simple
+  metadata that's useful as Bayes tokens, can be added to marked-up
+  messages using "add_header", etc., such as the trusted-relay inference
+  and language detection.
+
+2. as more complex data structures on the $msg->{metadata} object.  This
+  is the form used for metadata like the HTML parse data, which is stored
+  there for access by eval rule code.  Such data is not a simple string,
+  so it is not added as a Bayes token by default (Bayes needs simple strings).
 
 =head1 PUBLIC METHODS
 
@@ -48,9 +63,12 @@
 sub new {
   my ($class, $msg) = @_;
   $class = ref($class) || $class;
+
   my $self = {
-    msg => $msg
+    msg =>		$msg,
+    strings =>		{ }
   };
+
   bless($self,$class);
   $self;
 }
@@ -99,8 +117,12 @@
   $body = join ("\n", @{$body});
   $body =~ s/^Subject://i;
 
+  # note body text length, since the check_languages() eval rule also
+  # uses it
+  $self->{languages_body_len} = length($body);
+
   # need about 256 bytes for reasonably accurate match (experimentally derived)
-  if (length($body) < 256)
+  if ($self->{languages_body_len} < 256)
   {
     dbg("Message too short for language analysis");
     return;

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	Thu Feb 19 10:41:47 2004
@@ -196,9 +196,9 @@
     $self->{score} += $self->{learned_points};
   }
 
+  # delete temporary storage and memory allocation used during checking
   $self->delete_fulltext_tmpfile();
 
-
   # Round the score to 3 decimal places to avoid rounding issues
   # We assume required_score to be properly rounded already.
   # add 0 to force it back to numeric representation instead of string.
@@ -221,6 +221,7 @@
 
   $report =~ s/\n*$/\n\n/s;
   $self->{report} = $report;
+  $self->{msg}->finish_metadata();
 
   $self->{main}->call_plugins ("check_end", { permsgstatus => $self });
 }