You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/10/18 08:24:19 UTC
svn commit: r1904676 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Message.pm PerMsgStatus.pm Plugin/BodyEval.pm Plugin/HTMLEval.pm Plugin/HTTPSMismatch.pm Plugin/URIEval.pm
Author: hege
Date: Tue Oct 18 08:24:19 2022
New Revision: 1904676
URL: http://svn.apache.org/viewvc?rev=1904676&view=rev
Log:
Bug 8063 - uri not detected if two text/html parts exist
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Tue Oct 18 08:24:19 2022
@@ -1303,6 +1303,7 @@ sub get_body_text_array_common {
# text/plain rendered as html otherwise.
if ($html_needs_setting && $type eq 'text/html') {
$self->{metadata}->{html} = $p->{html_results};
+ push @{$self->{metadata}->{html_all}}, $p->{html_results};
}
}
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue Oct 18 08:24:19 2022
@@ -2043,6 +2043,7 @@ sub extract_message_metadata {
$self->get_decoded_stripped_body_text_array();
}
$self->{html} = $self->{msg}->{metadata}->{html};
+ $self->{html_all} = $self->{msg}->{metadata}->{html_all};
# allow plugins to add more metadata, read the stuff that's there, etc.
$self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
@@ -2788,18 +2789,20 @@ sub _process_html_uri_list {
my ($self) = @_;
# get URIs from HTML parsing
- # use the metadata version since $self->{html} may not be setup
- my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
- $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
-
- # canonicalize the HTML parsed URIs
- while(my($uri, $info) = each %{ $detail }) {
- if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
- # Need also to copy and uniq anchor text
- if (exists $info->{anchor_text}) {
- my %seen;
- foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
- push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
+ # use the metadata version since $self->{html_all} may not be setup
+ foreach my $html (@{$self->{msg}->{metadata}->{html_all}}) {
+ my $detail = $html->{uri_detail} || { };
+ $self->{'uri_truncated'} = 1 if $html->{uri_truncated};
+
+ # canonicalize the HTML parsed URIs
+ while(my($uri, $info) = each %{ $detail }) {
+ if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
+ # Need also to copy and uniq anchor text
+ if (exists $info->{anchor_text}) {
+ my %seen;
+ foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
+ push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
+ }
}
}
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm Tue Oct 18 08:24:19 2022
@@ -123,7 +123,7 @@ sub _multipart_alternative_difference {
}
# If there are no words, mark if there's at least 1 image ...
- if (!%html && exists $pms->{html}{inside}{img}) {
+ if (!%html && exists $text->{html_results}{inside}{img}) {
# Use "\n" as the mark since it can't ever occur normally
$html{"\n"}=1;
}
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm Tue Oct 18 08:24:19 2022
@@ -61,65 +61,88 @@ sub html_tag_balance {
return 0 if $rawtag !~ /^([a-zA-Z0-9]+)$/;
my $tag = $1;
- return 0 unless exists $pms->{html}{inside}{$tag};
-
return 0 if $rawexpr !~ /^([\<\>\=\!\-\+ 0-9]+)$/;
my $expr = untaint_var($1);
- $pms->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
- my $val = untaint_var($1);
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless exists $html->{inside}{$tag};
+ $html->{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+ my $val = untaint_var($1);
+ return 1 if eval "\$val $expr";
+ }
- return eval "\$val $expr";
+ return 0;
}
sub html_image_only {
my ($self, $pms, undef, $min, $max) = @_;
- return (exists $pms->{html}{inside}{img} &&
- exists $pms->{html}{length} &&
- $pms->{html}{length} > $min &&
- $pms->{html}{length} <= $max);
+ foreach my $html (@{$pms->{html_all}}) {
+ if (exists $html->{inside}{img} && exists $html->{length} &&
+ $html->{length} > $min && $html->{length} <= $max)
+ {
+ return 1;
+ }
+ }
+
+ return 0;
}
sub html_image_ratio {
my ($self, $pms, undef, $min, $max) = @_;
- return 0 unless (exists $pms->{html}{non_space_len} &&
- exists $pms->{html}{image_area} &&
- $pms->{html}{image_area} > 0);
- my $ratio = $pms->{html}{non_space_len} / $pms->{html}{image_area};
- return ($ratio > $min && $ratio <= $max);
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless (exists $html->{non_space_len} &&
+ exists $html->{image_area} &&
+ $html->{image_area} > 0);
+ my $ratio = $html->{non_space_len} / $html->{image_area};
+ return 1 if $ratio > $min && $ratio <= $max;
+ }
+
+ return 0;
}
sub html_charset_faraway {
my ($self, $pms) = @_;
- return 0 unless exists $pms->{html}{charsets};
-
my @locales = Mail::SpamAssassin::Util::get_my_locales($pms->{conf}->{ok_locales});
return 0 if grep { $_ eq "all" } @locales;
- my $okay = 0;
- my $bad = 0;
- for my $c (split(' ', $pms->{html}{charsets})) {
- if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
- $okay++;
- }
- else {
- $bad++;
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless exists $html->{charsets};
+ my $okay = 0;
+ my $bad = 0;
+ foreach my $c (split(/\s+/, $html->{charsets})) {
+ if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
+ $okay++;
+ } else {
+ $bad++;
+ }
}
+ return 1 if $bad && $bad >= $okay;
}
- return ($bad && ($bad >= $okay));
+
+ return 0;
}
sub html_tag_exists {
my ($self, $pms, undef, $tag) = @_;
- return exists $pms->{html}{inside}{$tag};
+
+ foreach my $html (@{$pms->{html_all}}) {
+ return 1 if exists $html->{inside}{$tag};
+ }
+
+ return 0;
}
sub html_test {
my ($self, $pms, undef, $test) = @_;
- return $pms->{html}{$test} ? 1 : 0;
+
+ foreach my $html (@{$pms->{html_all}}) {
+ return 1 if $html->{$test};
+ }
+
+ return 0;
}
sub html_eval {
@@ -128,29 +151,38 @@ sub html_eval {
return 0 if $rawexpr !~ /^([\<\>\=\!\-\+ 0-9]+)$/;
my $expr = untaint_var($1);
- # workaround bug 3320: weird perl bug where additional, very explicit
- # untainting into a new var is required.
- my $tainted = $pms->{html}{$test};
- return 0 unless defined($tainted);
- my $val = $tainted;
+ foreach my $html (@{$pms->{html_all}}) {
+ # workaround bug 3320: weird perl bug where additional, very explicit
+ # untainting into a new var is required.
+ my $tainted = $html->{$test};
+ next unless defined($tainted);
+ my $val = $tainted;
+ # just use the value in $val, don't copy it needlessly
+ return 1 if eval "\$val $expr";
+ }
- # just use the value in $val, don't copy it needlessly
- return eval "\$val $expr";
+ return 0;
}
sub html_text_match {
my ($self, $pms, undef, $text, $regexp) = @_;
+
my ($rec, $err) = compile_regexp($regexp, 0);
if (!$rec) {
warn "htmleval: html_text_match invalid regexp '$regexp': $err";
return 0;
}
- foreach my $string (@{$pms->{html}{$text}}) {
- next unless defined $string;
- if ($string =~ $rec) {
- return 1;
+
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless ref($html->{$text}) eq 'ARRAY';
+ foreach my $string (@{$html->{$text}}) {
+ next unless defined $string;
+ if ($string =~ $rec) {
+ return 1;
+ }
}
}
+
return 0;
}
@@ -161,53 +193,73 @@ sub html_title_subject_ratio {
if ($subject eq '') {
return 0;
}
- my $max = 0;
- for my $string (@{ $pms->{html}{title} }) {
- if ($string) {
- my $ratio = length($string) / length($subject);
- $max = $ratio if $ratio > $max;
+
+ foreach my $html (@{$pms->{html_all}}) {
+ my $max = 0;
+ foreach my $string (@{$html->{title}}) {
+ if ($string) {
+ my $ratio_s = length($string) / length($subject);
+ $max = $ratio_s if $ratio_s > $max;
+ }
}
+ return 1 if $max > $ratio;
}
- return $max > $ratio;
+
+ return 0;
}
sub html_text_not_match {
my ($self, $pms, undef, $text, $regexp) = @_;
- for my $string (@{ $pms->{html}{$text} }) {
- if (defined $string && $string !~ /${regexp}/) {
- return 1;
+
+ my ($rec, $err) = compile_regexp($regexp, 0);
+ if (!$rec) {
+ warn "htmleval: html_text_not_match invalid regexp '$regexp': $err";
+ return 0;
+ }
+
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless ref($html->{$text}) eq 'ARRAY';
+ foreach my $string (@{$html->{$text}}) {
+ if (defined $string && $string !~ $rec) {
+ return 1;
+ }
}
}
+
return 0;
}
sub html_range {
my ($self, $pms, undef, $test, $min, $max) = @_;
- return 0 unless exists $pms->{html}{$test};
-
- $test = $pms->{html}{$test};
-
- # not all perls understand what "inf" means, so we need to do
- # non-numeric tests! urg!
- if (!defined $max || $max eq "inf") {
- return ($test eq "inf") ? 1 : ($test > $min);
- }
- elsif ($test eq "inf") {
- # $max < inf, so $test == inf means $test > $max
- return 0;
- }
- else {
- # if we get here everything should be a number
- return ($test > $min && $test <= $max);
+ foreach my $html (@{$pms->{html_all}}) {
+ next unless defined $html->{$test};
+ my $value = $html->{$test};
+ # not all perls understand what "inf" means, so we need to do
+ # non-numeric tests! urg!
+ if (!defined $max || $max eq "inf") {
+ return 1 if $value > $min;
+ }
+ elsif ($value eq "inf") {
+ # $max < inf, so $value == inf means $value > $max
+ next;
+ }
+ else {
+ # if we get here everything should be a number
+ return 1 if $value > $min && $value <= $max;
+ }
}
+
+ return 0;
}
sub check_iframe_src {
my ($self, $pms) = @_;
- foreach my $v ( values %{$pms->{html}->{uri_detail}} ) {
- return 1 if $v->{types}->{iframe};
+ foreach my $html (@{$pms->{html_all}}) {
+ foreach my $v (values %{$html->{uri_detail}}) {
+ return 1 if $v->{types}->{iframe};
+ }
}
return 0;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm Tue Oct 18 08:24:19 2022
@@ -47,39 +47,37 @@ sub new {
# ("<" and ">" replaced with "[lt]" and "[gt]" to avoid Kaspersky Desktop AV
# false positive ;)
sub check_https_http_mismatch {
- my ($self, $permsgstatus, undef, $minanchors, $maxanchors) = @_;
+ my ($self, $pms, undef, $minanchors, $maxanchors) = @_;
$minanchors ||= 1;
- if (!exists $permsgstatus->{chhm_hit}) {
- $permsgstatus->{chhm_hit} = 0;
- $permsgstatus->{chhm_anchors} = 0;
-
- foreach my $k ( keys %{$permsgstatus->{html}->{uri_detail}} ) {
- my %uri_detail = %{$permsgstatus->{html}->{uri_detail}};
- my $v = ${uri_detail}{$k};
+ foreach my $html (@{$pms->{html_all}}) {
+ my $hit = 0;
+ my $anchors = 0;
+ foreach my $k (keys %{$html->{uri_detail}}) {
+ my $v = $html->{uri_detail}->{$k};
+
# if the URI wasn't used for an anchor tag, or the anchor text didn't
# exist, skip this.
- next unless (exists $v->{anchor_text} && @{$v->{anchor_text}});
+ next unless exists $v->{anchor_text} && @{$v->{anchor_text}};
my $uri;
if ($k =~ m@^https?://([^/:?#]+)@i) {
$uri = $1;
# Skip IPs since there's another rule to catch that already
if ($uri =~ IS_IP_ADDRESS) {
- undef $uri;
+ $uri = undef;
next;
}
# want to compare whole hostnames instead of domains?
# comment this next section to the blank line.
$uri = $self->{main}->{registryboundaries}->trim_domain($uri);
my $domain = $self->{main}->{registryboundaries}->uri_to_domain($uri);
- undef $uri unless ($self->{main}->{registryboundaries}->is_domain_valid($domain));
+ $uri = undef unless $self->{main}->{registryboundaries}->is_domain_valid($domain);
}
-
next unless $uri;
- $permsgstatus->{chhm_anchors}++ if exists $v->{anchor_text};
+ $anchors++ if exists $v->{anchor_text};
foreach (@{$v->{anchor_text}}) {
if (m@https://([^\s/:?#]+)@i) {
my $https = $1;
@@ -88,22 +86,23 @@ sub check_https_http_mismatch {
# comment this next section to the blank line.
if ($https !~ IS_IP_ADDRESS) {
$https = $self->{main}->{registryboundaries}->trim_domain($https);
- undef $https unless ($self->{main}->{registryboundaries}->is_domain_valid($https));
+ $https = undef unless $self->{main}->{registryboundaries}->is_domain_valid($https);
}
next unless $https;
-
dbg("https_http_mismatch: domains $uri -> $https");
-
next if $uri eq $https;
- $permsgstatus->{chhm_hit} = 1;
+ $hit = 1;
last;
}
}
}
- dbg("https_http_mismatch: anchors ".$permsgstatus->{chhm_anchors});
+
+ dbg("https_http_mismatch: anchors $anchors");
+ return 1 if $hit && $anchors >= $minanchors &&
+ (!defined $maxanchors || $anchors < $maxanchors);
}
- return ( $permsgstatus->{chhm_hit} && $permsgstatus->{chhm_anchors} >= $minanchors && (defined $maxanchors && $permsgstatus->{chhm_anchors} < $maxanchors) );
+ return 0;
}
1;
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm Tue Oct 18 08:24:19 2022
@@ -54,7 +54,7 @@ sub check_for_http_redirector {
while (s{^https?://([^/:\?]+).+?(https?:/{0,2}?([^/:\?]+).*)$}{$2}i) {
my ($redir, $dest) = ($1, $3);
foreach ($redir, $dest) {
- $_ = $self->{main}->{registryboundaries}->uri_to_domain($_) || $_;
+ $_ = $self->{main}->{registryboundaries}->uri_to_domain($_) || $_;
}
next if ($redir eq $dest);
dbg("eval: redirect: found $redir to $dest, flagging");
@@ -69,13 +69,15 @@ sub check_for_http_redirector {
sub check_https_ip_mismatch {
my ($self, $pms) = @_;
- while (my($k,$v) = each %{$pms->{html}->{uri_detail}}) {
- next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
- foreach (@{$v->{anchor_text}}) {
- next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
- if (m%https:%i) {
- keys %{$self->{html}->{uri_detail}}; # resets iterator, bug 4829
- return 1;
+ foreach my $html (@{$pms->{html_all}}) {
+ foreach my $k (keys %{$html->{uri_detail}}) {
+ my $v = $html->{uri_detail}->{$k};
+ next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
+ foreach (@{$v->{anchor_text}}) {
+ next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
+ if (m%https:%i) {
+ return 1;
+ }
}
}
}