You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/03/05 13:51:29 UTC
svn commit: r1898622 - /spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
Author: hege
Date: Sat Mar 5 13:51:29 2022
New Revision: 1898622
URL: http://svn.apache.org/viewvc?rev=1898622&view=rev
Log:
Add some string/tag and URI size limits; improve URI parsing
Modified:
spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm?rev=1898622&r1=1898621&r2=1898622&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/PDFInfo.pm Sat Mar 5 13:51:29 2022
@@ -263,6 +263,7 @@ sub _get_pdf_details {
my $no_more_fuzzy = 0;
my $got_image = 0;
my $encrypted = 0;
+ my %uris;
while ($data =~ /([^\n]+)/g) {
# dbg("pdfinfo: line=$1");
@@ -327,10 +328,14 @@ sub _get_pdf_details {
}
# XXX some pdf have uris but are stored inside binary data
- if ($line =~ /\/S\s?\/URI\s?\/URI\s?\(([^\)\\]+)\)\s?/) {
- my $location = $1;
- dbg("pdfinfo: found URI: $location");
- $pms->add_uri_detail_list($location);
+ if (keys %uris < 20 && $line =~ /(?:\/S\s{0,2}\/URI\s{0,2}|^\s*)\/URI\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
+ my $location = _parse_string($1);
+ next unless index($location, '.') > 0; # ignore some binary mess
+ if (!exists $uris{$location}) {
+ $uris{$location} = 1;
+ dbg("pdfinfo: found URI: $location");
+ $pms->add_uri_detail_list($location);
+ }
}
# [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
@@ -344,35 +349,35 @@ sub _get_pdf_details {
# Or hex values
# /Creator<FEFF005700720069007400650072>
if ($line =~ /\/Author\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
- my $author = _clean_property($1);
+ my $author = _parse_string($1);
dbg("pdfinfo: found property Author=$author");
$pms->{pdfinfo}->{details}->{author}->{$author} = 1;
_set_tag($pms, 'PDFAUTHOR', $author);
}
if ($line =~ /\/Creator\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
- my $creator = _clean_property($1);
+ my $creator = _parse_string($1);
dbg("pdfinfo: found property Creator=$creator");
$pms->{pdfinfo}->{details}->{creator}->{$creator} = 1;
_set_tag($pms, 'PDFCREATOR', $creator);
}
if ($line =~ /\/CreationDate\s{0,2}\(D\:(\d+)/) {
- my $created = _clean_property($1);
+ my $created = _parse_string($1);
dbg("pdfinfo: found property Created=$created");
$pms->{pdfinfo}->{details}->{created}->{$created} = 1;
}
if ($line =~ /\/ModDate\s{0,2}\(D\:(\d+)/) {
- my $modified = _clean_property($1);
+ my $modified = _parse_string($1);
dbg("pdfinfo: found property Modified=$modified");
$pms->{pdfinfo}->{details}->{modified}->{$modified} = 1;
}
if ($line =~ /\/Producer\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
- my $producer = _clean_property($1);
+ my $producer = _parse_string($1);
dbg("pdfinfo: found property Producer=$producer");
$pms->{pdfinfo}->{details}->{producer}->{$producer} = 1;
_set_tag($pms, 'PDFPRODUCER', $producer);
}
if ($line =~ /\/Title\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
- my $title = _clean_property($1);
+ my $title = _parse_string($1);
dbg("pdfinfo: found property Title=$title");
$pms->{pdfinfo}->{details}->{title}->{$title} = 1;
_set_tag($pms, 'PDFTITLE', $title);
@@ -410,7 +415,7 @@ sub _get_pdf_details {
}
}
-sub _clean_property {
+sub _parse_string {
local $_ = shift;
# Anything inside < > is hex encoded
if (/^</) {
@@ -431,17 +436,21 @@ sub _clean_property {
# Title(Foo \(bar\))
s/\\([()\\])/$1/g;
}
- return $_;
+ # Limit to some sane length
+ return substr($_, 0, 256);
}
sub _set_tag {
my ($pms, $tag, $value) = @_;
return unless defined $value && $value ne '';
- dbg("pdfinfo: set_tag called for $tag $value");
+ dbg("pdfinfo: set_tag called for $tag: $value");
if (exists $pms->{tag_data}->{$tag}) {
- $pms->{tag_data}->{$tag} .= ' '.$value; # append value
+ # Limit to some sane length
+ if (length($pms->{tag_data}->{$tag}) < 2048) {
+ $pms->{tag_data}->{$tag} .= ' '.$value; # append value
+ }
}
else {
$pms->{tag_data}->{$tag} = $value;