You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/05/16 08:47:23 UTC

svn commit: rev 10687 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin

Author: quinlan
Date: Sat May 15 23:47:23 2004
New Revision: 10687

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
Log:
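bug 3372: HTML parser confused by blank lines in paragraph
  Buffer up text until the next whitespace-generating tag (br, div, li,
  td, dd, p, hr, blockquote, pre) or the end of the document is reached;
  when flushing the buffer, collapse each run of whitespace into a single
  space and trim the leading and trailing space.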


Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/HTML.pm	Sat May 15 23:47:23 2004
@@ -76,12 +76,12 @@
   my $self = {};
   bless($self, $class);
 
-  $self->html_init();
+  $self->html_start();
 
   return $self;
 }
 
-sub html_init {
+sub html_start {
   my ($self) = @_;
 
   $self->{basefont} = 3;
@@ -94,6 +94,12 @@
   push @{ $self->{text_style} }, \%default;
 }
 
+sub html_end {
+  my ($self) = @_;
+
+  $self->display_text();
+}
+
 sub get_results {
   my ($self) = @_;
 
@@ -120,6 +126,9 @@
   $self->{html_text} = [];
   $self->{html_visible_text} = [];
   $self->{html_invisible_text} = [];
+  $self->{last_text} = "";
+  $self->{last_visible_text} = "";
+  $self->{last_invisible_text} = "";
   $self->{html_last_tag} = 0;
   $self->{html}{closed_html} = 0;
   $self->{html}{closed_body} = 0;
@@ -157,8 +166,9 @@
   my $hp = HTML::Parser->new(
 		api_version => 3,
 		handlers => [
-		  start_document => [sub { $self->html_init(@_) }],
+		  start_document => [sub { $self->html_start(@_) }],
 		  start => [sub { $self->html_tag(@_) }, "tagname,attr,'+1'"],
+		  end_document => [sub { $self->html_end(@_) }],
 		  end => [sub { $self->html_tag(@_) }, "tagname,attr,'-1'"],
 		  text => [sub { $self->html_text(@_) }, "dtext"],
 		  comment => [sub { $self->html_comment(@_) }, "text"],
@@ -218,13 +228,12 @@
       $self->html_format($tag, $attr, $num);
       $self->html_uri($tag, $attr, $num);
       $self->html_tests($tag, $attr, $num);
-      $self->{html_last_tag} = $tag;
     }
+    # end tags
     elsif ($num == -1) {
       $self->{html}{closed_html} = 1 if $tag eq "html";
       $self->{html}{closed_body} = 1 if $tag eq "body";
     }
-
     # shouting
     if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
       $self->{html}{shouting} += $num;
@@ -232,6 +241,8 @@
 	$self->{html}{max_shouting} = $self->{html}{shouting};
       }
     }
+
+    $self->{html_last_tag} = (($num < 0) ? "/" : "") . $tag;
   }
 }
 
@@ -240,16 +251,20 @@
 
   # ordered by frequency of tag groups
   if ($tag eq "br" || $tag eq "div") {
+    $self->display_text();
     push @{$self->{html_visible_text}}, "\n";
     push @{$self->{html_invisible_text}}, "\n";
     push @{$self->{html_text}}, "\n";
   }
-  elsif ($tag eq "li" || $tag eq "td" || $tag eq "dd") {
+  # should probably add th and dt here
+  elsif ($tag =~ /^(?:li|td|dd)$/) {
+    $self->display_text();
     push @{$self->{html_visible_text}}, " ";
     push @{$self->{html_invisible_text}}, " ";
     push @{$self->{html_text}}, " ";
   }
   elsif ($tag =~ /^(?:p|hr|blockquote|pre)$/) {
+    $self->display_text();
     push @{$self->{html_visible_text}}, "\n\n";
     push @{$self->{html_invisible_text}}, "\n\n";
     push @{$self->{html_text}}, "\n\n";
@@ -649,6 +664,7 @@
 sub css_style {
   my ($self, $tag, $attr, $num) = @_;
 
+  # TODO: something here
 }
 
 # body, font, table, tr, th, td, big, small
@@ -892,6 +908,19 @@
   $self->{html}{big_font} = 1 if ($type eq "px" && $size > 18);
 }
 
+sub display_text {
+  my ($self) = @_;
+
+  for my $type ('text', 'visible_text', 'invisible_text') {
+    my $text = $self->{"last_$type"};
+    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
+    $text =~ s/^ //;
+    $text =~ s/ $//;
+    push @{$self->{"html_$type"}}, $text;
+    $self->{"last_$type"} = "";
+  }
+}
+
 sub html_text {
   my ($self, $text) = @_;
 
@@ -928,20 +957,18 @@
     $self->{html}{text_after_html} = 1 if $self->{html}{closed_html};
   }
 
-  $text =~ s/^\n//s if $self->{html_last_tag} eq "br";
-
-  if (defined $self->{html_text}[-1]) {
-    my $last = $self->{html_text}[-1];
-
+  if ($self->{last_text}) {
     # ideas discarded since they would be easy to evade:
     # 1. using \w or [A-Za-z] instead of \S or non-punctuation
     # 2. exempting certain tags
     if ($text =~ /^[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]/s &&
-	$last =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
+	$self->{last_text} =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
     {
       $self->{html}{obfuscation}++;
     }
-    if ($last =~ /\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s) {
+    if ($self->{last_text} =~
+	/\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s)
+    {
       my $start = length($1);
       if ($text =~ /^([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\b/s) {
 	my $backhair = $start . "_" . length($1);
@@ -952,12 +979,12 @@
   }
 
   if ($visible_for_bayes) {
-    push @{$self->{html_visible_text}}, $text;
+    $self->{last_visible_text} .= $text;
   }
   else {
-    push @{$self->{html_invisible_text}}, $text;
+    $self->{last_invisible_text} .= $text;
   }
-  push @{$self->{html_text}}, $text;
+  $self->{last_text} .= $text;
 }
 
 # note: $text includes <!-- and -->