You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/14 01:02:15 UTC

svn commit: rev 10629 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules

Author: quinlan
Date: Thu May 13 16:02:15 2004
New Revision: 10629

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
   incubator/spamassassin/trunk/rules/20_html_tests.cf
Log:
clean up anchor code - no additional ham hits; spam hits changed as follows:
  HTML_LINK_CLICK_CAPS increased from 3.14% to 3.82%
  HTML_LINK_CLICK_HERE increased from 15.16% to 19.71%
  HTML_LINK_PUSH_HERE increased from 2.02% to 2.23%


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Thu May 13 16:02:15 2004
@@ -112,6 +112,7 @@
   $self->{html}{image_area} = 0;
   $self->{html}{shouting} = 0;
   $self->{html}{max_shouting} = 0;
+  $self->{html}{anchor_index} = -1;
   $self->{html}{title_index} = -1;
   $self->{html}{max_size} = 3;	# start at default size
   $self->{html}{min_size} = 3;	# start at default size
@@ -859,16 +860,19 @@
   if ($tag =~ /^(?:object|embed)$/) {
     $self->{html}{embeds} = 1;
   }
+
+  # special text delimiters - <a> and <title>
+  if ($tag eq "a") {
+    $self->{html}{anchor_index}++;
+    $self->{html}{anchor}->[$self->{html}{anchor_index}] = "";
+  }
   if ($tag eq "title") {
     $self->{html}{title_index}++;
     $self->{html}{title}->[$self->{html}{title_index}] = "";
 
-    # begin test code
-    if ($self->{html}{title_index} > 0) {
-      $self->{html}{title_extra}++;
-    }
-    # end test code
+    $self->{html}{title_extra}++ if $self->{html}{title_index} > 0;
   }
+
   if ($tag eq "meta" &&
       exists $attr->{'http-equiv'} &&
       exists $attr->{content} &&
@@ -877,8 +881,6 @@
   {
     $self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
   }
-
-  $self->{html}{anchor_text} ||= "" if ($tag eq "a");
 }
 
 sub examine_text_style {
@@ -893,24 +895,17 @@
 sub html_text {
   my ($self, $text) = @_;
 
-  if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
-    $self->{html}{anchor_text} .= " $text";
-  }
-
+  # text that is not part of body
   if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
   {
-    if ($text =~ /\b(?:$events)\b/io)
-    {
-      $self->{html}{html_event} = 1;
-    }
     if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
     {
       $self->{html}{html_event_unsafe} = 1;
     }
+    if ($text =~ /\b(?:$events)\b/io) { $self->{html}{html_event} = 1; }
     if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
     return;
   }
-
   if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
     if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
       $self->examine_text_style ($1, $2);
@@ -918,8 +913,11 @@
     return;
   }
 
-  if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0)
-  {
+  # text that is part of body and also stored separately
+  if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
+    $self->{html}{anchor}->[$self->{html}{anchor_index}] .= $text;
+  }
+  if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0) {
     $self->{html}{title}->[$self->{html}{title_index}] .= $text;
   }
 

Modified: incubator/spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_html_tests.cf	(original)
+++ incubator/spamassassin/trunk/rules/20_html_tests.cf	Thu May 13 16:02:15 2004
@@ -170,13 +170,13 @@
 body HTML_JAVASCRIPT		eval:html_test('javascript')
 describe HTML_JAVASCRIPT	JavaScript code
 
-body HTML_LINK_PUSH_HERE	eval:html_eval('anchor_text', '=~ /(?:push|go)\s*(?:here|this)/i')
+body HTML_LINK_PUSH_HERE	eval:html_text('anchor', '=~ /(?:push|go)\s*(?:here|this)/i')
 describe HTML_LINK_PUSH_HERE	HTML link text says "push here" or similar
 
-body HTML_LINK_CLICK_HERE	eval:html_eval('anchor_text', '=~ /click\s*(?:here|this)/i')
+body HTML_LINK_CLICK_HERE	eval:html_text('anchor', '=~ /click\s*(?:here|this)/i')
 describe HTML_LINK_CLICK_HERE	HTML link text says "click here"
 
-body HTML_LINK_CLICK_CAPS	eval:html_eval('anchor_text', '=~ /CLICK/')
+body HTML_LINK_CLICK_CAPS	eval:html_text('anchor', '=~ /CLICK/')
 describe HTML_LINK_CLICK_CAPS	HTML link text says "CLICK"
 
 # HTML obfuscation