You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2014/12/02 21:16:48 UTC
svn commit: r1643003 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm

Author: axb
Date: Tue Dec  2 20:16:48 2014
New Revision: 1643003

URL: http://svn.apache.org/r1643003
Log:
saving PDFInfo.pm plugin

Added:
    spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm   (with props)

Added: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm?rev=1643003&view=auto
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm (added)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm Tue Dec  2 20:16:48 2014
@@ -0,0 +1,743 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin
+
+=head1 SYNOPSIS
+
+loadplugin     Mail::SpamAssassin::Plugin::PDFInfo
+
+=head1 DESCRIPTION
+
+This plugin helps detect spam using attached PDF files.
+
+=over 4
+
+=item See "Usage:" below - for more documentation see 20_pdfinfo.cf
+
+=back
+
+# Original info kept for history.
+# -------------------------------------------------------
+# PDFInfo Plugin for SpamAssassin
+# Version: 0.8
+# Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $
+# Created: 2007-08-10
+# Modified: 2007-08-10
+# By: Dallas Engelken
+#
+#
+# Changes:
+#   0.8 - added .fdf detection (thanks John Lundin) [axb]
+#   0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb]
+#   0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc.
+#       - fixed issue on perl 5.6.1 where pdf_match_details() failed to call
+#         _find_pdf_mime_parts(), resulting in no detection of pdf mime parts.
+#       - quoted-printable support - requires MIME::QuotedPrint (which should be in everyone's
+#         install as a part of the MIME-Base64 package which is a SA req)
+#       - added simple pdf_is_empty_body() function which counts the body bytes minus the
+#         subject line.  can add optional <bytes> param if you need to allow for a few bytes.
+#   0.5 - fix warns for undef $pdf_tags
+#       - remove { } and \ before running eval in pdf_match_details to avoid eval error
+#   0.4 - added pdf_is_encrypted() function
+#       - added option to look for image HxW on same line
+#   0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data
+#       - renamed pdf_image_named() to pdf_named()
+#          - PDF images are encapsulated and have no names.  We are matching the PDF file name.
+#       - renamed pdf_image_name_regex() to pdf_name_regex()
+#          - PDF images are encapsulated and have no names.  We are matching the PDF file name.
+#       - changed pdf_image_count() a bit and added pdf_count().
+#          - pdf_count() checks how many pdf attachments there are on the mail
+#          - pdf_image_count() checks how many images are found within all pdfs in the mail.
+#       - removed the restriction of the pdf containing an image in order to md5 it.
+#       - added pdf_match_details() function to check the following 'details'
+#          - author: Author of PDF if specified
+#          - producer: Software used to produce PDF
+#          - creator: Software used to produce PDF, usually similar to producer
+#          - title: Title of PDF
+#          - created: Creation Date
+#          - modified: Last Modified
+#   0.2 - support PDF octet-stream
+#   0.1 - just ported over the imageinfo code, and renamed to pdfinfo.
+#         - removed all support for png, gif, and jpg from the code.
+#         - prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2.
+#
+#
+# Usage:
+#
+#  pdf_count()
+#
+#     body RULENAME  eval:pdf_count(<min>,[max])
+#        min: required, message contains at least x pdf mime parts
+#        max: optional, if specified, must not contain more than x pdf mime parts
+#
+#  pdf_image_count()
+#
+#     body RULENAME  eval:pdf_image_count(<min>,[max])
+#        min: required, message contains at least x images in pdf attachments.
+#        max: optional, if specified, must not contain more than x pdf images
+#
+#  pdf_pixel_coverage()
+#
+#     body RULENAME  eval:pdf_pixel_coverage(<min>,[max])
+#        min: required, message contains at least this much pixel area
+#        max: optional, if specified, message must not contain more than this much pixel area
+#
+#  pdf_named()
+#
+#     body RULENAME  eval:pdf_named(<string>)
+#        string: exact file name match, if you need partial match, see pdf_name_regex()
+#
+#  pdf_name_regex()
+#
+#     body RULENAME  eval:pdf_name_regex(<regex>)
+#        regex: regular expression, see examples in ruleset
+#
+#  pdf_match_md5()
+#
+#     body RULENAME  eval:pdf_match_md5(<string>)
+#        string: 32-byte md5 hex
+#
+#  pdf_match_fuzzy_md5()
+#
+#     body RULENAME  eval:pdf_match_fuzzy_md5(<string>)
+#        string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5
+#
+#  pdf_match_details()
+#
+#     body RULENAME  eval:pdf_match_details(<detail>,<regex>);
+#        detail: author, creator, created, modified, producer, title
+#        regex: regular expression, see examples in ruleset
+#
+#  pdf_is_encrypted()
+#
+#     body RULENAME eval:pdf_is_encrypted()
+#
+#  pdf_is_empty_body()
+#
+#     body RULENAME eval:pdf_is_empty_body(<bytes>)
+#        bytes: maximum byte count to allow and still consider it empty
+#
+#  NOTE: See the ruleset for more examples that are not documented here.
+#
+# -------------------------------------------------------
+
+=cut
+
+package Mail::SpamAssassin::Plugin::PDFInfo;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+use strict;
+use warnings;
+use bytes;
+use Digest::MD5 qw(md5_hex);
+use MIME::QuotedPrint;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+# Constructor: builds the plugin object through the parent class and
+# registers the name of every eval rule implemented in this package.
+#   $class        - class name, or an existing object whose class is reused
+#   $mailsaobject - passed straight through to SUPER::new
+# Returns the blessed plugin object.
+sub new {
+  my ($class, $mailsaobject) = @_;
+
+  # accept being invoked on an instance as well as on the class name
+  $class = ref($class) || $class;
+  my $self = bless($class->SUPER::new($mailsaobject), $class);
+
+  # rules are registered in the order they are listed here
+  my @eval_rules = qw(
+    pdf_count
+    pdf_image_count
+    pdf_pixel_coverage
+    pdf_image_size_exact
+    pdf_image_size_range
+    pdf_named
+    pdf_name_regex
+    pdf_image_to_text_ratio
+    pdf_match_md5
+    pdf_match_fuzzy_md5
+    pdf_match_details
+    pdf_is_encrypted
+    pdf_is_empty_body
+  );
+  foreach my $rule (@eval_rules) {
+    $self->register_eval_rule ($rule);
+  }
+
+  return $self;
+}
+
+# -----------------------------------------
+
+# Dispatch table of per-type parsers, keyed by the short type name used by
+# _find_pdf_mime_parts() ('pdf' is the only entry).  A parser is invoked as
+# $parser->($self, $pms, $part).
+#
+# The 'pdf' parser decodes the mime part, requires a "?PDF-x.y" signature
+# in the first 8 bytes and then scans the document line by line, recording
+# in $pms->{pdfinfo}:
+#   names_pdf        - attachment file names seen
+#   dems_pdf         - "HxW" keys for every image found, plus the summed
+#                      heights x summed widths of this pdf
+#   pc_pdf           - summed pixel area of the images in this pdf
+#   count_pdf_images - number of images found
+#   encrypted        - 1 once any parsed pdf had a line starting /Encrypt
+#   details          - author, creator, created, modified, producer, title
+#   md5              - md5 of the decoded pdf
+#   fuzzy_md5        - both fuzzy md5s (early non-stream lines, tag layout)
+# and publishing the matching PDF* tags through _set_tag().  It returns
+# early, recording nothing, when the signature is missing.
+my %get_details = (
+  'pdf' => sub {
+    my ($self, $pms, $part) = @_;
+
+    # Always use the part's own decoder.  The previous code compared the
+    # part's {type} with 'quoted-printable' and, in that branch, ran
+    # decode_qp() over an empty string, which could only ever produce an
+    # empty document.
+    # NOTE(review): assumes decode() undoes quoted-printable as well as
+    # base64 - confirm against Mail::SpamAssassin::Message::Node.
+    my $data = $part->decode();
+    return unless defined $data;
+
+    # the signature has to be within the first 8 bytes
+    my $index = substr($data, 0, 8);
+
+    return unless ($index =~ /.PDF\-(\d\.\d)/);
+    my $version = $1;
+    $self->_set_tag($pms, 'PDFVERSION', $version);
+    # dbg("pdfinfo: pdf version = $version");
+
+    my ($height, $width) = (0, 0);
+    my ($fuzzy_data, $pdf_tags) = ('', '');
+    my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
+    my ($md5, $fuzzy_md5, $tags_md5) = ('', '', '');
+    my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);
+
+    my $name = $part->{'name'} || '';
+    $self->_set_tag($pms, 'PDFNAME', $name);
+
+    my $no_more_fuzzy = 0;
+    my $got_image = 0;
+    my $encrypted = 0;
+
+    while($data =~ /([^\n]+)/g) {
+      # dbg("pdfinfo: line=$1");
+      my $line = $1;
+
+      $line_count++;
+
+      # lines containing high bytes will have no data we need, so save some cycles
+      next if ($line =~ /[\x80-\xff]/);
+
+      # fuzzy md5 #1: the first lines of the document, minus comments,
+      # dimensions and transformation matrices
+      if (!$no_more_fuzzy && $line_count < 70) {
+        if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
+          $line =~ s/\s+$//;  # strip off whitespace at end.
+          $fuzzy_data .= $line;
+        }
+      }
+
+      # fuzzy md5 #2: the sequence of tag names that start a line
+      if ($line =~ m/^\/([A-Za-z]+)/) {
+         $pdf_tags .= $1;
+      }
+
+      $got_image=1 if ($line =~ m/\/Image/);
+      $encrypted=1 if ($line =~ m/^\/Encrypt/);
+
+      # once we hit the first stream, we stop collecting data for fuzzy md5
+      $no_more_fuzzy = 1 if ($line =~ m/stream/);
+
+      # From a v1.3 pdf
+      # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
+      # [12234] dbg: pdfinfo: line=/Width 630
+      # [12234] dbg: pdfinfo: line=/Height 149
+      if ($got_image) {
+        if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
+          $width = $1;
+          $height = $2;
+        }
+        elsif ($line =~ /^\/Width\s(\d+)/) {
+          $width = $1;
+        }
+        elsif ($line =~ /^\/Height\s(\d+)/) {
+          $height = $1;
+        }
+        elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
+          $width = $1;
+          $height = $2;
+        }
+      }
+
+      # did pdf contain image data?
+      if ($got_image && $width && $height) {
+        $no_more_fuzzy = 1;
+        my $area = $width * $height;
+        $total_height += $height;
+        $total_width += $width;
+        $total_area += $area;
+        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
+        $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
+        dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
+        $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
+        $height=0; $width=0;  # reset and check for next image
+        $got_image = 0;
+      }
+
+      # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
+      # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
+      # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
+      # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
+      # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
+      # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
+      # or all on same line inside xml - v1.6+
+      # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>
+
+      if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
+        $producer = $1;
+      }
+      if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
+        $created = $1;
+      }
+      if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
+        $modified = $1;
+      }
+      if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
+        $title = $1;
+        # Title=\376\377\000w\000w\000n\000g
+        # Title=wwng
+        $title =~ s/\\\d{3}//g;
+      }
+      if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
+        $creator = $1;
+      }
+      if ($line =~ /\/Author\s?\(([^\)]+)/) {
+        $author = $1;
+        # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
+        # Author=HP_Administrator
+        $author =~ s/\\\d{3}//g;
+      }
+    }
+
+    # store the file name so we can check pdf_named() or pdf_name_regex() later.
+    $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;
+
+    # store encrypted flag.  Keep it set once any pdf was encrypted, so a
+    # later unencrypted attachment cannot clear it again.
+    $pms->{pdfinfo}->{encrypted} = ($pms->{pdfinfo}->{encrypted} || $encrypted) ? 1 : 0;
+
+    # if we had multiple images in the pdf, we need to store the total HxW as well.
+    # If it was a single Image PDF, then this value will already be in the hash.
+    $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);
+
+    if ($total_area) {
+      $pms->{pdfinfo}->{pc_pdf} = $total_area;
+      $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
+      dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)");
+    }
+
+    dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");
+
+    # plain assignments: "my $x = ... if COND" has unspecified behaviour
+    $md5 = uc(md5_hex($data)) if $data;
+    $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
+    $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;
+
+    dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));
+
+    # we dont need tags for these.
+    $pms->{pdfinfo}->{details}->{created} = $created if $created;
+    $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;
+
+    if ($producer) {
+      $pms->{pdfinfo}->{details}->{producer} = $producer;
+      $self->_set_tag($pms, 'PDFPRODUCER', $producer);
+    }
+    if ($title) {
+      $pms->{pdfinfo}->{details}->{title} = $title;
+      $self->_set_tag($pms, 'PDFTITLE', $title);
+    }
+    if ($creator) {
+      $pms->{pdfinfo}->{details}->{creator} = $creator;
+      $self->_set_tag($pms, 'PDFCREATOR', $creator);
+    }
+    if ($author) {
+      $pms->{pdfinfo}->{details}->{author} = $author;
+      $self->_set_tag($pms, 'PDFAUTHOR', $author);
+    }
+    if ($md5) {
+      $pms->{pdfinfo}->{md5}->{$md5} = 1;
+      # the PDFMD5 tag carries the exact md5, not the fuzzy one
+      $self->_set_tag($pms, 'PDFMD5', $md5);
+    }
+    if ($fuzzy_md5) {
+      $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
+      $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
+    }
+    if ($tags_md5) {
+      $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
+      $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
+    }
+  },
+
+);
+
+# ----------------------------------------
+
+# Stores a tag value in $pms->{tag_data}.  A tag that already has a value
+# gets the new one appended after a single space, so repeated calls build
+# up a space separated list.
+#   $tag   - tag name; a false name is ignored
+#   $value - tag value; a false value (empty string, 0, undef) is ignored
+sub _set_tag {
+  my ($self, $pms, $tag, $value) = @_;
+
+  dbg("pdfinfo: set_tag called for $tag $value");
+  return if (!$tag || !$value);
+
+  if (!exists $pms->{tag_data}->{$tag}) {
+    # first value seen for this tag
+    $pms->{tag_data}->{$tag} = $value;
+  }
+  else {
+    $pms->{tag_data}->{$tag} .= " $value";  # append value
+  }
+}
+
+# ----------------------------------------
+
+# Scans the message for mime parts that may hold a pdf, runs the matching
+# %get_details parser over each accepted part and fills $pms->{'pdfinfo'}.
+# A part is accepted when its content-transfer-encoding is base64 or
+# quoted-printable and either its file name ends in .pdf/.fdf or its mime
+# subtype is pdf.
+#   $pms - per-message object; {msg} is searched, {'pdfinfo'} is written
+# Returns 0 straight away when an earlier scan found no candidate parts
+# ({'no_parts'} is cached for that purpose).  Callers in this file ignore
+# the return value otherwise.
+sub _find_pdf_mime_parts {
+  my ($self,$pms) = @_;
+
+  # bail early if message does not have pdf parts
+  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
+
+  # initialize
+  $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
+  $pms->{'pdfinfo'}->{"count_pdf"} = 0;
+  $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;
+
+  my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
+  my $part_count = scalar @parts;
+
+  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");
+
+  # cache this so we can easily bail
+  $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;
+
+  foreach my $p (@parts) {
+    # List assignment, so $type receives the captured subtype ('pdf',
+    # 'octet-stream').  A scalar assignment stores only the match's
+    # success flag, which made the "$type eq 'pdf'" test below always fail.
+    my ($type) = (defined $p->{'type'} ? $p->{'type'} : '') =~ m@/([\w\-]+)$@;
+    $type = '' unless defined $type;
+
+    my $name = $p->{'name'};
+    $name = '' unless defined $name;
+
+    # parenthesised: "lc X || ''" parses as "lc(X) || ''" and so applies
+    # lc to an undefined header before the default can kick in
+    my $cte = lc($p->get_header('content-transfer-encoding') || '');
+
+    dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");
+
+    # make sure its a cte we support
+    next unless ($cte =~ /^(?:base64|quoted\-printable)$/);
+
+    # filename must end with .pdf, or application type can be pdf
+    # sometimes windows muas will wrap a pdf up inside a .dat file
+    # v0.8 - Added .fdf phoney PDF detection
+    next unless ($name =~ /\.[fp]df$/ || $type eq 'pdf');
+
+    # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
+    $type='pdf';
+
+    if ($type && exists $get_details{$type}) {
+       $get_details{$type}->($self, $pms, $p);
+       $pms->{'pdfinfo'}->{"count_$type"} ++;
+    }
+  }
+
+  $self->_set_tag($pms, 'PDFCOUNT',  $pms->{'pdfinfo'}->{"count_pdf"});
+  $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});
+
+}
+
+
+# ----------------------------------------
+
+# Eval rule: does a recorded pdf attachment carry exactly this file name?
+#   $name - file name looked up among the keys of {'pdfinfo'}{names_pdf}
+# Returns 1 on a match, 0 otherwise, and nothing when $name is undefined.
+sub pdf_named {
+  my ($self,$pms,$body,$name) = @_;
+  if (!defined $name) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"names_pdf"})) {
+    return 0;
+  }
+
+  return exists($pms->{'pdfinfo'}->{"names_pdf"}->{$name}) ? 1 : 0;
+}
+
+# -----------------------------------------
+
+# Eval rule: does any recorded pdf attachment file name match a regex?
+#   $re - the regex as written in the rule, delimiters and modifiers
+#         included (e.g. '/^invoice\d+\.pdf$/i')
+# Returns 1 on the first matching name, 0 otherwise, and nothing when $re
+# is undefined.
+#
+# SECURITY: $re is compiled with a string eval and therefore has to come
+# from trusted rule configuration.  The attachment name is supplied by the
+# sender, so it is only ever referenced through the lexical $name and is
+# never spliced into the evaluated source; the former q{...} interpolation
+# let a crafted file name inject Perl code.
+sub pdf_name_regex {
+  my ($self,$pms,$body,$re) = @_;
+  return unless (defined $re);
+
+  # make sure we have image data read in.
+  if (!exists $pms->{'pdfinfo'}) {
+    $self->_find_pdf_mime_parts($pms);
+  }
+
+  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
+  return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
+
+  # only the trusted rule text goes into the source; $name and $hit are
+  # the lexicals in scope at the point of the eval below
+  my $code = 'if ($name =~ '.$re.') { $hit = 1; }';
+
+  foreach my $name (keys %{$pms->{'pdfinfo'}->{"names_pdf"}}) {
+    my $hit = 0;
+    eval $code;
+    dbg("pdfinfo: error in regex $re - $@") if $@;
+    if ($hit) {
+      dbg("pdfinfo: pdf_name_regex hit on $name");
+      return 1;
+    }
+  }
+  return 0;
+
+}
+
+# -----------------------------------------
+
+# Eval rule: reports the 'encrypted' flag stored in $pms->{'pdfinfo'}.
+# Returns 0 when the message has no candidate parts, otherwise the stored
+# flag unchanged (which is undefined when no pdf has been parsed).
+sub pdf_is_encrypted {
+  my ($self,$pms,$body) = @_;
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})) {
+    return 0;
+  }
+
+  my $flag = $pms->{'pdfinfo'}->{'encrypted'};
+  return $flag;
+}
+
+# -----------------------------------------
+
+# Eval rule: is the number of pdf mime parts within [min, max]?
+#   $min - required lower bound
+#   $max - optional upper bound
+# Returns the result_check() verdict for {'pdfinfo'}{count_pdf}, 0 when
+# there is nothing to count, and nothing when $min is undefined.
+sub pdf_count {
+  my ($self,$pms,$body,$min,$max) = @_;
+  if (!defined $min) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"count_pdf"})) {
+    return 0;
+  }
+
+  my $count = $pms->{'pdfinfo'}->{"count_pdf"};
+  return result_check($min, $max, $count);
+}
+
+# -----------------------------------------
+
+# Eval rule: is the number of images found inside the pdf attachments
+# within [min, max]?
+#   $min - required lower bound
+#   $max - optional upper bound
+# Returns the result_check() verdict for {'pdfinfo'}{count_pdf_images},
+# 0 when there is nothing to count, and nothing when $min is undefined.
+sub pdf_image_count {
+  my ($self,$pms,$body,$min,$max) = @_;
+  if (!defined $min) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"count_pdf_images"})) {
+    return 0;
+  }
+
+  my $images = $pms->{'pdfinfo'}->{"count_pdf_images"};
+  return result_check($min, $max, $images);
+}
+
+# -----------------------------------------
+
+# Eval rule: is the recorded pdf image pixel area within [min, max]?
+#   $min - required lower bound
+#   $max - optional upper bound
+# Returns the result_check() verdict for {'pdfinfo'}{pc_pdf}, 0 when no
+# area is recorded, and nothing when $min is undefined.
+sub pdf_pixel_coverage {
+  my ($self,$pms,$body,$min,$max) = @_;
+  if (!defined $min) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"pc_pdf"})) {
+    return 0;
+  }
+
+  my $area = $pms->{'pdfinfo'}->{"pc_pdf"};
+  return result_check($min, $max, $area);
+}
+
+# -----------------------------------------
+
+# Eval rule: is (body text length / recorded pdf pixel area) at least
+# $min and strictly below $max?
+#   $body - array ref of body lines; their joined length is the text size
+#   $min  - required lower bound of the ratio
+#   $max  - required upper bound; a ratio equal to $max does not match
+# Returns 1 or 0, and nothing when $min or $max is undefined.
+sub pdf_image_to_text_ratio {
+  my ($self,$pms,$body,$min,$max) = @_;
+  if (!defined $min || !defined $max) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"pc_pdf"})) {
+    return 0;
+  }
+
+  # the length depends on whether the rule is a body or a rawbody rule,
+  # since that decides what $body holds
+  my $text = join('',@$body);
+  my $textlen = length($text);
+
+  # nothing to compare without both some text and some pixel area
+  unless ( $textlen > 0 && exists $pms->{'pdfinfo'}->{"pc_pdf"} && $pms->{'pdfinfo'}->{"pc_pdf"} > 0) {
+    return 0;
+  }
+
+  my $ratio = $textlen / $pms->{'pdfinfo'}->{"pc_pdf"};
+  dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
+
+  # trailing 1: a ratio equal to $max is rejected
+  return result_check($min, $max, $ratio, 1);
+}
+
+# -----------------------------------------
+
+# Eval rule: does a message with at least one pdf attachment carry (next
+# to) no body text?
+#   $body - array ref of body lines; the first entry is treated as the
+#           subject line and is not counted
+#   $min  - optional; largest byte count still regarded as empty (default 0)
+# Returns 1 when the counted bytes are 0 or do not exceed $min, else 0.
+#
+# Only lines containing a non-whitespace character and not starting with
+# "Subject" are counted.  The count is cached in
+# {'pdfinfo'}{body_text_bytes}, so rules using different $min values each
+# get a verdict for their own threshold; {'pdfinfo'}{no_body_text} keeps
+# holding the most recent verdict.
+sub pdf_is_empty_body {
+  my ($self,$pms,$body,$min) = @_;
+
+  $min ||= 0;  # default to 0 bytes
+
+  # make sure we have image data read in.
+  if (!exists $pms->{'pdfinfo'}) {
+    $self->_find_pdf_mime_parts($pms);
+  }
+
+  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
+  return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};
+
+  my $bytes = $pms->{'pdfinfo'}->{"body_text_bytes"};
+  if (!defined $bytes) {
+    # Work on a copy: $body belongs to the caller, and shifting it in
+    # place dropped one more line from the caller's array on every call.
+    my @lines = @{ $body || [] };
+    shift @lines;  # removes line #1 -> subject line.
+
+    $bytes = 0;
+    foreach my $line (@lines) {
+      next unless (defined $line && $line =~ m/\S/);
+      next if ($line =~ m/^Subject/);
+      $bytes += length($line);
+    }
+
+    $pms->{'pdfinfo'}->{"body_text_bytes"} = $bytes;
+    dbg("pdfinfo: is_empty_body = $bytes bytes");
+  }
+
+  my $empty = ($bytes == 0 || ($bytes <= $min)) ? 1 : 0;
+
+  # cache the verdict and return it
+  $pms->{'pdfinfo'}->{"no_body_text"} = $empty;
+  return $empty;
+}
+
+# -----------------------------------------
+
+# Eval rule: was an image (or per-pdf total) of exactly this size recorded?
+#   $height, $width - looked up as the key "${height}x${width}" in
+#                     {'pdfinfo'}{dems_pdf}
+# Returns 1 when the key exists, 0 otherwise, and nothing when either
+# dimension is undefined.
+sub pdf_image_size_exact {
+  my ($self,$pms,$body,$height,$width) = @_;
+  if (!defined $height || !defined $width) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"dems_pdf"})) {
+    return 0;
+  }
+
+  return exists($pms->{'pdfinfo'}->{"dems_pdf"}->{"${height}x${width}"}) ? 1 : 0;
+}
+
+# -----------------------------------------
+
+# Eval rule: does any recorded "HxW" size fall inside the given range?
+#   $minh, $minw - required minimum height and width
+#   $maxh, $maxw - optional maximum height and width
+# Returns 1 for the first size inside the range, 0 when there is none,
+# and nothing when a minimum is undefined.
+sub pdf_image_size_range {
+  my ($self,$pms,$body,$minh,$minw,$maxh,$maxw) = @_;
+  if (!defined $minh || !defined $minw) {
+    return;
+  }
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"dems_pdf"})) {
+    return 0;
+  }
+
+  foreach my $size (keys %{$pms->{'pdfinfo'}->{"dems_pdf"}}) {
+    my ($tall, $wide) = split(/x/,$size);
+
+    # too small in either direction
+    next if ($tall < $minh || $wide < $minw);
+
+    # too large, for whichever maximum was supplied
+    next if (defined $maxh && $tall > $maxh);
+    next if (defined $maxw && $wide > $maxw);
+
+    return 1;
+  }
+
+  return 0;
+}
+
+# -----------------------------------------
+
+# Eval rule: does a recorded pdf md5 equal the given hex digest?
+#   $md5 - hex digest; upper-cased before the lookup in {'pdfinfo'}{md5}
+# Returns 1 on a match, 0 otherwise, and nothing when $md5 is undefined.
+sub pdf_match_md5 {
+  my ($self,$pms,$body,$md5) = @_;
+  if (!defined $md5) {
+    return;
+  }
+
+  # digests are looked up in upper case
+  my $wanted = uc($md5);
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"md5"})) {
+    return 0;
+  }
+
+  return exists($pms->{'pdfinfo'}->{"md5"}->{$wanted}) ? 1 : 0;
+}
+
+# -----------------------------------------
+
+# Eval rule: does a recorded fuzzy pdf md5 equal the given hex digest?
+#   $md5 - hex digest; upper-cased before the lookup in
+#          {'pdfinfo'}{fuzzy_md5}
+# Returns 1 on a match, 0 otherwise, and nothing when $md5 is undefined.
+sub pdf_match_fuzzy_md5 {
+  my ($self,$pms,$body,$md5) = @_;
+  if (!defined $md5) {
+    return;
+  }
+
+  # digests are looked up in upper case
+  my $wanted = uc($md5);
+
+  # scan the message on first use; later calls reuse $pms->{'pdfinfo'}
+  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});
+
+  if (exists($pms->{'pdfinfo'}->{'no_parts'})
+      || !exists($pms->{'pdfinfo'}->{"fuzzy_md5"})) {
+    return 0;
+  }
+
+  return exists($pms->{'pdfinfo'}->{"fuzzy_md5"}->{$wanted}) ? 1 : 0;
+}
+
+# -----------------------------------------
+
+# Eval rule: does one of the recorded pdf details match a regex?
+#   $detail - key into {'pdfinfo'}{details}: author, creator, created,
+#             modified, producer or title
+#   $regex  - the regex as written in the rule, delimiters and modifiers
+#             included
+# Returns 1 on a match and 0 on no match or when nothing was recorded.
+# Returns nothing when an argument is missing or the detail has no value.
+#
+# SECURITY: $regex is compiled with a string eval and therefore has to
+# come from trusted rule configuration.  The detail value comes out of the
+# attachment, so it is matched through the lexical $check_value and is
+# never spliced into the evaluated source.
+sub pdf_match_details {
+  my ($self, $pms, $body, $detail, $regex) = @_;
+  return unless ($detail && $regex);
+
+  # make sure we have pdf data read in.
+  if (!exists $pms->{'pdfinfo'}) {
+    $self->_find_pdf_mime_parts($pms);
+  }
+
+  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
+  return 0 unless (exists $pms->{'pdfinfo'}->{'details'});
+
+  my $check_value = $pms->{pdfinfo}->{details}->{$detail};
+  return unless $check_value;
+
+  # braces and backslashes are still removed, so the text a rule is
+  # matched against stays exactly what it has always been
+  $check_value =~ s/[\{\}\\]//g;
+
+  my $hit = 0;
+  eval 'if ($check_value =~ '.$regex.') { $hit = 1; }';
+  dbg("pdfinfo: error in regex $regex - $@") if $@;
+  if ($hit) {
+    dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
+    return 1;
+  }
+  return 0;
+}
+
+# -----------------------------------------
+
+# Range test shared by the counting eval rules.
+#   $min        - lower bound, inclusive
+#   $max        - upper bound, inclusive; undefined means no upper bound
+#   $value      - number to test; undefined never matches
+#   $nomaxequal - when true, a value equal to $max is rejected as well
+# Returns 1 when $value is inside the range, 0 otherwise.
+sub result_check {
+  my ($min, $max, $value, $nomaxequal) = @_;
+  return 0 unless defined $value;
+  return 0 if ($value < $min);
+
+  # without an upper bound there is nothing more to check; this also keeps
+  # the $nomaxequal comparison away from an undefined $max
+  return 1 unless defined $max;
+
+  return 0 if ($value > $max);
+  return 0 if ($nomaxequal && $value == $max);
+  return 1;
+}
+
+# -----------------------------------------
+
+1;
+

Propchange: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
------------------------------------------------------------------------------
    svn:executable = *