You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/03/03 08:39:19 UTC

svn commit: r1898557 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm

Author: hege
Date: Thu Mar  3 08:39:19 2022
New Revision: 1898557

URL: http://svn.apache.org/viewvc?rev=1898557&view=rev
Log:
Bug 7960 - PDFInfo misses valid metadata

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm?rev=1898557&r1=1898556&r2=1898557&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm Thu Mar  3 08:39:19 2022
@@ -215,7 +215,6 @@ my %get_details = (
     my $no_more_fuzzy = 0;
     my $got_image = 0;
     my $encrypted = 0;
-    my $location;
 
     while($data =~ /([^\n]+)/g) {
       # dbg("pdfinfo: line=$1");
@@ -228,25 +227,11 @@ my %get_details = (
           $line =~ s/\s+$//;  # strip off whitespace at end.
           $fuzzy_data .= $line;
 	}
+	# once we hit the first stream, we stop collecting data for fuzzy md5
+	$no_more_fuzzy = 1  if index($line, 'stream') >= 0;
       }
 
-      if ($line =~ m/^\/([A-Za-z]+)/) {
-         $pdf_tags .= $1;
-      }
-
-      $got_image=1 if ($line =~ m/\/Image/);
-      $encrypted=1 if ($line =~ m/^\/Encrypt/);
-
-      # once we hit the first stream, we stop collecting data for fuzzy md5
-      $no_more_fuzzy = 1 if (index($line, 'stream') >= 0);
-
-      # XXX some pdf have uris but are stored inside binary data
-      if ($line =~ /\/S\s?\/URI\s?\/URI\s?\(([^\)\\]+)\)\s?/) {
-         $location = $1;
-         dbg("pdfinfo: found URI $location in pdf " . ($name ? $name : ''));
-         $pms->add_uri_detail_list($location);
-         undef $location;
-      }
+      $got_image = 1  if index($line, '/Image') >= 0;
 
       # From a v1.3 pdf
       # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
@@ -267,21 +252,37 @@ my %get_details = (
           $width = $1;
           $height = $2;
         }
+        if ($width && $height) {
+          $no_more_fuzzy = 1;
+          my $area = $width * $height;
+          $total_height += $height;
+          $total_width += $width;
+          $total_area += $area;
+          $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
+          $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
+          dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
+          $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
+          $height=0; $width=0;  # reset and check for next image
+          $got_image = 0;
+        }
+      }
+
+      #
+      # Triage - expecting / to be found for rest of the checks
+      #
+      next unless index($line, '/') >= 0;
+
+      $encrypted = 1  if index($line, '/Encrypt') == 0;
+
+      if ($line =~ m/^\/([A-Za-z]+)/) {
+         $pdf_tags .= $1;
       }
 
-      # did pdf contain image data?
-      if ($got_image && $width && $height) {
-        $no_more_fuzzy = 1;
-        my $area = $width * $height;
-        $total_height += $height;
-        $total_width += $width;
-        $total_area += $area;
-        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
-        $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
-        dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
-        $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
-        $height=0; $width=0;  # reset and check for next image
-        $got_image = 0;
+      # XXX some pdf have uris but are stored inside binary data
+      if ($line =~ /\/S\s?\/URI\s?\/URI\s?\(([^\)\\]+)\)\s?/) {
+         my $location = $1;
+         dbg("pdfinfo: found URI $location in pdf " . ($name ? $name : ''));
+         $pms->add_uri_detail_list($location);
       }
 
       # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
@@ -292,30 +293,23 @@ my %get_details = (
       # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
       # or all on same line inside xml - v1.6+
       # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>
-
-      if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
-        $producer = $1;
+      if ($line =~ /\/Producer\s{0,2}\((.*?(?<!\\))\)/) {
+        $producer = clean_property($1);
       }
-      if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
-        $created = $1;
+      if ($line =~ /\/CreationDate\s{0,2}\(D\:(\d+)/) {
+        $created = clean_property($1);
       }
-      if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
-        $modified = $1;
+      if ($line =~ /\/ModDate\s{0,2}\(D\:(\d+)/) {
+        $modified = clean_property($1);
       }
-      if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
-        $title = $1;
-        # Title=\376\377\000w\000w\000n\000g
-        # Title=wwng
-        $title =~ s/\\\d{3}//g;
-      }
-      if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
-        $creator = $1;
-      }
-      if ($line =~ /\/Author\s?\(([^\)]+)/) {
-        $author = $1;
-        # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
-        # Author=HP_Administrator
-        $author =~ s/\\\d{3}//g;
+      if ($line =~ /\/Title\s{0,2}\((.*?(?<!\\))\)/) {
+        $title = clean_property($1);
+      }
+      if ($line =~ /\/Creator\s{0,2}\((.*?(?<!\\))\)/) {
+        $creator = clean_property($1);
+      }
+      if ($line =~ /\/Author\s{0,2}\((.*?(?<!\\))\)/) {
+        $author = clean_property($1);
       }
     }
 
@@ -380,6 +374,21 @@ my %get_details = (
 
 );
 
+sub clean_property {
+  local $_ = shift;
+  # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
+  # Handle UTF-16 (in ultra-naive way for now)
+  if (s/^\xfe\xff//) {
+    s/\x00//g;
+  } elsif (s/^\\376\\377//) {
+    s/\\00?0?//g;
+  }
+  # Fix quoted parenthesis:
+  # Title(Foo \(bar\))
+  s/\\([()])/$1/g;
+  return $_;
+}
+
 # ----------------------------------------
 
 sub _set_tag {