You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/10/18 08:24:19 UTC
svn commit: r1904676 - in /spamassassin/trunk/lib/Mail/SpamAssassin: Message.pm PerMsgStatus.pm Plugin/BodyEval.pm Plugin/HTMLEval.pm Plugin/HTTPSMismatch.pm Plugin/URIEval.pm

Author: hege
Date: Tue Oct 18 08:24:19 2022
New Revision: 1904676

URL: http://svn.apache.org/viewvc?rev=1904676&view=rev
Log:
Bug 8063 - uri not detected if two text/html parts exist

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Message.pm Tue Oct 18 08:24:19 2022
@@ -1303,6 +1303,7 @@ sub get_body_text_array_common {
       # text/plain rendered as html otherwise.
       if ($html_needs_setting && $type eq 'text/html') {
         $self->{metadata}->{html} = $p->{html_results};
+        push @{$self->{metadata}->{html_all}}, $p->{html_results};
       }
     }
   }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm Tue Oct 18 08:24:19 2022
@@ -2043,6 +2043,7 @@ sub extract_message_metadata {
     $self->get_decoded_stripped_body_text_array();
   }
   $self->{html} = $self->{msg}->{metadata}->{html};
+  $self->{html_all} = $self->{msg}->{metadata}->{html_all};
 
   # allow plugins to add more metadata, read the stuff that's there, etc.
   $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
@@ -2788,18 +2789,20 @@ sub _process_html_uri_list {
   my ($self) = @_;
 
   # get URIs from HTML parsing
-  # use the metadata version since $self->{html} may not be setup
-  my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
-  $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
-
-  # canonicalize the HTML parsed URIs
-  while(my($uri, $info) = each %{ $detail }) {
-    if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
-      # Need also to copy and uniq anchor text
-      if (exists $info->{anchor_text}) {
-        my %seen;
-        foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
-          push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
+  # use the metadata version since $self->{html_all} may not be setup
+  foreach my $html (@{$self->{msg}->{metadata}->{html_all}}) {
+    my $detail = $html->{uri_detail} || { };
+    $self->{'uri_truncated'} = 1 if $html->{uri_truncated};
+
+    # canonicalize the HTML parsed URIs
+    while(my($uri, $info) = each %{ $detail }) {
+      if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
+        # Need also to copy and uniq anchor text
+        if (exists $info->{anchor_text}) {
+          my %seen;
+          foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
+            push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
+          }
         }
       }
     }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/BodyEval.pm Tue Oct 18 08:24:19 2022
@@ -123,7 +123,7 @@ sub _multipart_alternative_difference {
         }
 
 	# If there are no words, mark if there's at least 1 image ...
-	if (!%html && exists $pms->{html}{inside}{img}) {
+	if (!%html && exists $text->{html_results}{inside}{img}) {
 	  # Use "\n" as the mark since it can't ever occur normally
 	  $html{"\n"}=1;
 	}

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTMLEval.pm Tue Oct 18 08:24:19 2022
@@ -61,65 +61,88 @@ sub html_tag_balance {
   return 0 if $rawtag !~ /^([a-zA-Z0-9]+)$/;
   my $tag = $1;
 
-  return 0 unless exists $pms->{html}{inside}{$tag};
-
   return 0 if $rawexpr !~ /^([\<\>\=\!\-\+ 0-9]+)$/;
   my $expr = untaint_var($1);
 
-  $pms->{html}{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
-  my $val = untaint_var($1);
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless exists $html->{inside}{$tag};
+    $html->{inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
+    my $val = untaint_var($1);
+    return 1 if eval "\$val $expr";
+  }
 
-  return eval "\$val $expr";
+  return 0;
 }
 
 sub html_image_only {
   my ($self, $pms, undef, $min, $max) = @_;
 
-  return (exists $pms->{html}{inside}{img} &&
-	  exists $pms->{html}{length} &&
-	  $pms->{html}{length} > $min &&
-	  $pms->{html}{length} <= $max);
+  foreach my $html (@{$pms->{html_all}}) {
+    if (exists $html->{inside}{img} && exists $html->{length} &&
+        $html->{length} > $min && $html->{length} <= $max)
+    {
+      return 1;
+    }
+  }
+
+  return 0;
 }
 
 sub html_image_ratio {
   my ($self, $pms, undef, $min, $max) = @_;
 
-  return 0 unless (exists $pms->{html}{non_space_len} &&
-		   exists $pms->{html}{image_area} &&
-		   $pms->{html}{image_area} > 0);
-  my $ratio = $pms->{html}{non_space_len} / $pms->{html}{image_area};
-  return ($ratio > $min && $ratio <= $max);
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless (exists $html->{non_space_len} &&
+                 exists $html->{image_area} &&
+                 $html->{image_area} > 0);
+    my $ratio = $html->{non_space_len} / $html->{image_area};
+    return 1 if $ratio > $min && $ratio <= $max;
+  }
+
+  return 0;
 }
 
 sub html_charset_faraway {
   my ($self, $pms) = @_;
 
-  return 0 unless exists $pms->{html}{charsets};
-
   my @locales = Mail::SpamAssassin::Util::get_my_locales($pms->{conf}->{ok_locales});
   return 0 if grep { $_ eq "all" } @locales;
 
-  my $okay = 0;
-  my $bad = 0;
-  for my $c (split(' ', $pms->{html}{charsets})) {
-    if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
-      $okay++;
-    }
-    else {
-      $bad++;
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless exists $html->{charsets};
+    my $okay = 0;
+    my $bad = 0;
+    foreach my $c (split(/\s+/, $html->{charsets})) {
+      if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
+        $okay++;
+      } else {
+        $bad++;
+      }
     }
+    return 1 if $bad && $bad >= $okay;
   }
-  return ($bad && ($bad >= $okay));
+
+  return 0;
 }
 
 sub html_tag_exists {
   my ($self, $pms, undef, $tag) = @_;
-  return exists $pms->{html}{inside}{$tag};
+
+  foreach my $html (@{$pms->{html_all}}) {
+    return 1 if exists $html->{inside}{$tag};
+  }
+
+  return 0;
 }
 
 sub html_test {
   my ($self, $pms, undef, $test) = @_;
-  return $pms->{html}{$test} ? 1 : 0;
+
+  foreach my $html (@{$pms->{html_all}}) {
+    return 1 if $html->{$test};
+  }
+
+  return 0;
 }
 
 sub html_eval {
@@ -128,29 +151,38 @@ sub html_eval {
   return 0 if $rawexpr !~ /^([\<\>\=\!\-\+ 0-9]+)$/;
   my $expr = untaint_var($1);
 
-  # workaround bug 3320: weird perl bug where additional, very explicit
-  # untainting into a new var is required.
-  my $tainted = $pms->{html}{$test};
-  return 0 unless defined($tainted);
-  my $val = $tainted;
+  foreach my $html (@{$pms->{html_all}}) {
+    # workaround bug 3320: weird perl bug where additional, very explicit
+    # untainting into a new var is required.
+    my $tainted = $html->{$test};
+    next unless defined($tainted);
+    my $val = $tainted;
+    # just use the value in $val, don't copy it needlessly
+    return 1 if eval "\$val $expr";
+  }
 
-  # just use the value in $val, don't copy it needlessly
-  return eval "\$val $expr";
+  return 0;
 }
 
 sub html_text_match {
   my ($self, $pms, undef, $text, $regexp) = @_;
+
   my ($rec, $err) = compile_regexp($regexp, 0);
   if (!$rec) {
     warn "htmleval: html_text_match invalid regexp '$regexp': $err";
     return 0;
   }
-  foreach my $string (@{$pms->{html}{$text}}) {
-    next unless defined $string;
-    if ($string =~ $rec) {
-      return 1;
+
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless ref($html->{$text}) eq 'ARRAY';
+    foreach my $string (@{$html->{$text}}) {
+      next unless defined $string;
+      if ($string =~ $rec) {
+        return 1;
+      }
     }
   }
+
   return 0;
 }
 
@@ -161,53 +193,73 @@ sub html_title_subject_ratio {
   if ($subject eq '') {
     return 0;
   }
-  my $max = 0;
-  for my $string (@{ $pms->{html}{title} }) {
-    if ($string) {
-      my $ratio = length($string) / length($subject);
-      $max = $ratio if $ratio > $max;
+
+  foreach my $html (@{$pms->{html_all}}) {
+    my $max = 0;
+    foreach my $string (@{$html->{title}}) {
+      if ($string) {
+        my $ratio_s = length($string) / length($subject);
+        $max = $ratio_s if $ratio_s > $max;
+      }
     }
+    return 1 if $max > $ratio;
   }
-  return $max > $ratio;
+
+  return 0;
 }
 
 sub html_text_not_match {
   my ($self, $pms, undef, $text, $regexp) = @_;
-  for my $string (@{ $pms->{html}{$text} }) {
-    if (defined $string && $string !~ /${regexp}/) {
-      return 1;
+
+  my ($rec, $err) = compile_regexp($regexp, 0);
+  if (!$rec) {
+    warn "htmleval: html_text_not_match invalid regexp '$regexp': $err";
+    return 0;
+  }
+
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless ref($html->{$text}) eq 'ARRAY';
+    foreach my $string (@{$html->{$text}}) {
+      if (defined $string && $string !~ $rec) {
+        return 1;
+      }
     }
   }
+
   return 0;
 }
 
 sub html_range {
   my ($self, $pms, undef, $test, $min, $max) = @_;
 
-  return 0 unless exists $pms->{html}{$test};
-
-  $test = $pms->{html}{$test};
-
-  # not all perls understand what "inf" means, so we need to do
-  # non-numeric tests!  urg!
-  if (!defined $max || $max eq "inf") {
-    return ($test eq "inf") ? 1 : ($test > $min);
-  }
-  elsif ($test eq "inf") {
-    # $max < inf, so $test == inf means $test > $max
-    return 0;
-  }
-  else {
-    # if we get here everything should be a number
-    return ($test > $min && $test <= $max);
+  foreach my $html (@{$pms->{html_all}}) {
+    next unless defined $html->{$test};
+    my $value = $html->{$test};
+    # not all perls understand what "inf" means, so we need to do
+    # non-numeric tests!  urg!
+    if (!defined $max || $max eq "inf") {
+      return 1 if $value > $min;
+    }
+    elsif ($value eq "inf") {
+      # $max < inf, so $value == inf means $value > $max
+      next;
+    }
+    else {
+      # if we get here everything should be a number
+      return 1 if $value > $min && $value <= $max;
+    }
   }
+
+  return 0;
 }
 
 sub check_iframe_src {
   my ($self, $pms) = @_;
 
-  foreach my $v ( values %{$pms->{html}->{uri_detail}} ) {
-    return 1 if $v->{types}->{iframe};
+  foreach my $html (@{$pms->{html_all}}) {
+    foreach my $v (values %{$html->{uri_detail}}) {
+      return 1 if $v->{types}->{iframe};
+    }
   }
 
   return 0;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/HTTPSMismatch.pm Tue Oct 18 08:24:19 2022
@@ -47,39 +47,37 @@ sub new {
 # ("<" and ">" replaced with "[lt]" and "[gt]" to avoid Kaspersky Desktop AV
 # false positive ;)
 sub check_https_http_mismatch {
-  my ($self, $permsgstatus, undef, $minanchors, $maxanchors) = @_;
+  my ($self, $pms, undef, $minanchors, $maxanchors) = @_;
 
   $minanchors ||= 1;
 
-  if (!exists $permsgstatus->{chhm_hit}) {
-    $permsgstatus->{chhm_hit} = 0;
-    $permsgstatus->{chhm_anchors} = 0;
-
-    foreach my $k ( keys %{$permsgstatus->{html}->{uri_detail}} ) {
-      my %uri_detail = %{$permsgstatus->{html}->{uri_detail}};
-      my $v = ${uri_detail}{$k};
+  foreach my $html (@{$pms->{html_all}}) {
+    my $hit = 0;
+    my $anchors = 0;
+    foreach my $k (keys %{$html->{uri_detail}}) {
+      my $v = $html->{uri_detail}->{$k};
+
       # if the URI wasn't used for an anchor tag, or the anchor text didn't
       # exist, skip this.
-      next unless (exists $v->{anchor_text} && @{$v->{anchor_text}});
+      next unless exists $v->{anchor_text} && @{$v->{anchor_text}};
 
       my $uri;
       if ($k =~ m@^https?://([^/:?#]+)@i) {
         $uri = $1;
         # Skip IPs since there's another rule to catch that already
         if ($uri =~ IS_IP_ADDRESS) {
-          undef $uri;
+          $uri = undef;
           next;
         } 
         # want to compare whole hostnames instead of domains?
         # comment this next section to the blank line.
         $uri = $self->{main}->{registryboundaries}->trim_domain($uri);
         my $domain = $self->{main}->{registryboundaries}->uri_to_domain($uri);
-        undef $uri unless ($self->{main}->{registryboundaries}->is_domain_valid($domain));
+        $uri = undef  unless $self->{main}->{registryboundaries}->is_domain_valid($domain);
       }
-
       next unless $uri;
-      $permsgstatus->{chhm_anchors}++ if exists $v->{anchor_text};
 
+      $anchors++ if exists $v->{anchor_text};
       foreach (@{$v->{anchor_text}}) {
         if (m@https://([^\s/:?#]+)@i) {
           my $https = $1;
@@ -88,22 +86,23 @@ sub check_https_http_mismatch {
 	  # comment this next section to the blank line.
           if ($https !~ IS_IP_ADDRESS) {
 	    $https = $self->{main}->{registryboundaries}->trim_domain($https);
-            undef $https unless ($self->{main}->{registryboundaries}->is_domain_valid($https));
+            $https = undef  unless $self->{main}->{registryboundaries}->is_domain_valid($https);
           }
 	  next unless $https;
-
 	  dbg("https_http_mismatch: domains $uri -> $https");
-
 	  next if $uri eq $https;
-	  $permsgstatus->{chhm_hit} = 1;
+	  $hit = 1;
 	  last;
         }
       }
     }
-    dbg("https_http_mismatch: anchors ".$permsgstatus->{chhm_anchors});
+
+    dbg("https_http_mismatch: anchors $anchors");
+    return 1 if $hit && $anchors >= $minanchors &&
+                (!defined $maxanchors || $anchors < $maxanchors);
   }
 
-  return ( $permsgstatus->{chhm_hit} && $permsgstatus->{chhm_anchors} >= $minanchors && (defined $maxanchors && $permsgstatus->{chhm_anchors} < $maxanchors) );
+  return 0;
 }
 
 1;

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm?rev=1904676&r1=1904675&r2=1904676&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/URIEval.pm Tue Oct 18 08:24:19 2022
@@ -54,7 +54,7 @@ sub check_for_http_redirector {
     while (s{^https?://([^/:\?]+).+?(https?:/{0,2}?([^/:\?]+).*)$}{$2}i) {
       my ($redir, $dest) = ($1, $3);
       foreach ($redir, $dest) {
-	$_ = $self->{main}->{registryboundaries}->uri_to_domain($_) || $_;
+        $_ = $self->{main}->{registryboundaries}->uri_to_domain($_) || $_;
       }
       next if ($redir eq $dest);
       dbg("eval: redirect: found $redir to $dest, flagging");
@@ -69,13 +69,15 @@ sub check_for_http_redirector {
 sub check_https_ip_mismatch {
   my ($self, $pms) = @_;
 
-  while (my($k,$v) = each %{$pms->{html}->{uri_detail}}) {
-    next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
-    foreach (@{$v->{anchor_text}}) {
-      next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
-      if (m%https:%i) {
-	keys %{$self->{html}->{uri_detail}}; # resets iterator, bug 4829
-	return 1;
+  foreach my $html (@{$pms->{html_all}}) {
+    foreach my $k (keys %{$html->{uri_detail}}) {
+      my $v = $html->{uri_detail}->{$k};
+      next if ($k !~ m%^https?:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
+      foreach (@{$v->{anchor_text}}) {
+        next if (m%^https:/*(?:[^\@/]+\@)?\d+\.\d+\.\d+\.\d+%i);
+        if (m%https:%i) {
+          return 1;
+        }
       }
     }
   }