You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/14 01:02:15 UTC
svn commit: rev 10629 - in incubator/spamassassin/trunk: lib/Mail/SpamAssassin rules
Author: quinlan
Date: Thu May 13 16:02:15 2004
New Revision: 10629
Modified:
incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
incubator/spamassassin/trunk/rules/20_html_tests.cf
Log:
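Clean up anchor code: anchor text is now collected per <a> tag in
$self->{html}{anchor} (mirroring the <title> handling) instead of one
concatenated anchor_text string. No additional ham hits; spam hits changed as follows: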
HTML_LINK_CLICK_CAPS increased from 3.14% to 3.82%
HTML_LINK_CLICK_HERE increased from 15.16% to 19.71%
HTML_LINK_PUSH_HERE increased from 2.02% to 2.23%
Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm (original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm Thu May 13 16:02:15 2004
@@ -112,6 +112,7 @@
$self->{html}{image_area} = 0;
$self->{html}{shouting} = 0;
$self->{html}{max_shouting} = 0;
+ $self->{html}{anchor_index} = -1;
$self->{html}{title_index} = -1;
$self->{html}{max_size} = 3; # start at default size
$self->{html}{min_size} = 3; # start at default size
@@ -859,16 +860,19 @@
if ($tag =~ /^(?:object|embed)$/) {
$self->{html}{embeds} = 1;
}
+
+ # special text delimiters - <a> and <title>
+ if ($tag eq "a") {
+ $self->{html}{anchor_index}++;
+ $self->{html}{anchor}->[$self->{html}{anchor_index}] = "";
+ }
if ($tag eq "title") {
$self->{html}{title_index}++;
$self->{html}{title}->[$self->{html}{title_index}] = "";
- # begin test code
- if ($self->{html}{title_index} > 0) {
- $self->{html}{title_extra}++;
- }
- # end test code
+ $self->{html}{title_extra}++ if $self->{html}{title_index} > 0;
}
+
if ($tag eq "meta" &&
exists $attr->{'http-equiv'} &&
exists $attr->{content} &&
@@ -877,8 +881,6 @@
{
$self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
}
-
- $self->{html}{anchor_text} ||= "" if ($tag eq "a");
}
sub examine_text_style {
@@ -893,24 +895,17 @@
sub html_text {
my ($self, $text) = @_;
- if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
- $self->{html}{anchor_text} .= " $text";
- }
-
+ # text that is not part of body
if (exists $self->{html}{inside_script} && $self->{html}{inside_script} > 0)
{
- if ($text =~ /\b(?:$events)\b/io)
- {
- $self->{html}{html_event} = 1;
- }
if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
{
$self->{html}{html_event_unsafe} = 1;
}
+ if ($text =~ /\b(?:$events)\b/io) { $self->{html}{html_event} = 1; }
if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
return;
}
-
if (exists $self->{html}{inside_style} && $self->{html}{inside_style} > 0) {
if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
$self->examine_text_style ($1, $2);
@@ -918,8 +913,11 @@
return;
}
- if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0)
- {
+ # text that is part of body and also stored separately
+ if (exists $self->{html}{inside_a} && $self->{html}{inside_a} > 0) {
+ $self->{html}{anchor}->[$self->{html}{anchor_index}] .= $text;
+ }
+ if (exists $self->{html}{inside_title} && $self->{html}{inside_title} > 0) {
$self->{html}{title}->[$self->{html}{title_index}] .= $text;
}
Modified: incubator/spamassassin/trunk/rules/20_html_tests.cf
==============================================================================
--- incubator/spamassassin/trunk/rules/20_html_tests.cf (original)
+++ incubator/spamassassin/trunk/rules/20_html_tests.cf Thu May 13 16:02:15 2004
@@ -170,13 +170,13 @@
body HTML_JAVASCRIPT eval:html_test('javascript')
describe HTML_JAVASCRIPT JavaScript code
-body HTML_LINK_PUSH_HERE eval:html_eval('anchor_text', '=~ /(?:push|go)\s*(?:here|this)/i')
+body HTML_LINK_PUSH_HERE eval:html_text('anchor', '=~ /(?:push|go)\s*(?:here|this)/i')
describe HTML_LINK_PUSH_HERE HTML link text says "push here" or similar
-body HTML_LINK_CLICK_HERE eval:html_eval('anchor_text', '=~ /click\s*(?:here|this)/i')
+body HTML_LINK_CLICK_HERE eval:html_text('anchor', '=~ /click\s*(?:here|this)/i')
describe HTML_LINK_CLICK_HERE HTML link text says "click here"
-body HTML_LINK_CLICK_CAPS eval:html_eval('anchor_text', '=~ /CLICK/')
+body HTML_LINK_CLICK_CAPS eval:html_text('anchor', '=~ /CLICK/')
describe HTML_LINK_CLICK_CAPS HTML link text says "CLICK"
# HTML obfuscation