You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by ax...@apache.org on 2014/12/02 21:15:03 UTC

svn commit: r1643000 - /spamassassin/trunk/rules/20_pdfinfo.cf

Author: axb
Date: Tue Dec  2 20:15:03 2014
New Revision: 1643000

URL: http://svn.apache.org/r1643000
Log:
saving PDFinfo rules from 2007!

Added:
    spamassassin/trunk/rules/20_pdfinfo.cf   (with props)

Added: spamassassin/trunk/rules/20_pdfinfo.cf
URL: http://svn.apache.org/viewvc/spamassassin/trunk/rules/20_pdfinfo.cf?rev=1643000&view=auto
==============================================================================
--- spamassassin/trunk/rules/20_pdfinfo.cf (added)
+++ spamassassin/trunk/rules/20_pdfinfo.cf Tue Dec  2 20:15:03 2014
@@ -0,0 +1,314 @@
+# SpamAssassin rules file: Pdfinfo rules
+#
+# Please don't modify this file as your changes will be overwritten with
+# the next update. Use @@LOCAL_RULES_DIR@@/local.cf instead.
+# See 'perldoc Mail::SpamAssassin::Conf' for details.
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+###########################################################################
+
+# 2014-12-02 - axb
+# Info and disabled rules kept for historical & documentation reasons
+# Updated rules may be added
+#
+# Original File: pdfinfo.cf
+# Original Version: 0.6
+# Info: $Id: pdfinfo.cf 895 2007-07-27 10:31:08Z alexb $
+# Created: 2007-06-25
+# Modified: 2007-07-19
+# Original / Defunct Site URL: http://www.rulesemporium.com/plugins.htm#PDFinfo
+# Author: Dallas Engelken (aka GMD :-)
+# Rules contributed by Alex Broens
+# Requires: PDFInfo.pm plugin
+# Description: This plugin/ruleset combination will help you alleviate the new
+#              PDF based stock spam which began to appear mid-June, 2007.
+#
+#
+# Changes:
+#
+#   0.6 - added easypdf producer rule and more no body text metas
+#       - tags support added, see USING TAGS below.
+#   0.5 - added fuzzy test 7
+#   0.4 - added new fuzzy for encrypted pdf image spams.
+#       - added rule to check for encryption
+#   0.3 - added rules based on the new pdf_match_details() function
+#       - added additional fuzzy md5 rules
+#       - disabled static md5 rules as they are no longer hitting.
+#   0.2 - added static md5 to hit full page stock spam.
+#   0.1 - initial ruleset.
+#
+
+############################################
+# USING TAGS
+############################################
+
+# The following tags can be defined in an add_header line
+#
+# _PDFCOUNT_    - total number of pdf mime parts in the email
+# _PDFIMGCOUNT_ - total number of images found inside pdf mime parts
+# _PDFVERSION_  - PDF Version, space separated if there are > 1 pdf attachments
+# _PDFNAME_     - Filenames as found in the mime headers of PDF parts
+# _PDFPRODUCER_ - Producer/Application that created the PDF(s)
+# _PDFAUTHOR_   - Author of the PDF
+# _PDFCREATOR_  - Creator/Program that created the PDF(s)
+# _PDFTITLE_    - Title of the PDF File, if available
+# _PDFIMGDIM_   - If PDF Contains images, the dimensions of them will be put here
+# _PDFIMGAREA_  - The total area of all combined images inside the PDF(s)
+# _PDFMD5_      - MD5 checksum of PDF(s) - space separated
+# _PDFMD5FUZZY1_- Fuzzy1 MD5 checksum of PDF(s) - space separated
+# _PDFMD5FUZZY2_- Fuzzy2 MD5 checksum of PDF(s) - space separated
+#
+# Example add_header lines
+#
+# add_header all PDF-Info pdf=_PDFCOUNT_, pdfimg=_PDFIMGCOUNT_, ver=_PDFVERSION_, name=_PDFNAME_
+# add_header all PDF-Details producer=_PDFPRODUCER_, author=_PDFAUTHOR_, creator=_PDFCREATOR_, title=_PDFTITLE_
+# add_header all PDF-ImageInfo dim=_PDFIMGDIM_, area=_PDFIMGAREA_
+# add_header all PDF-Md5 md5=_PDFMD5_, fuzzy1=_PDFMD5FUZZY1_, fuzzy2=_PDFMD5FUZZY2_
+#
+
+############################################
+# GENERIC RULE EXAMPLES SHOWING EVAL USAGE
+############################################
+
+# you can match by name
+# body          MY_TEST_PDF             eval:pdf_named('mytest.pdf')
+
+# or you can write a regex to match dynamic file names.
+# body          MY_TEST_PDF             eval:pdf_name_regex('/^(?:my|your)test\.pdf$/')
+
+# you can make it case insensitive by using modifiers
+# body          PDF_IMGXXXXX            eval:pdf_name_regex('/^IMG\D+\.\.PDF$/i')
+
+# you can do exact image size matches
+# body          PDF_DEMS_150_400        eval:pdf_image_size_exact(150,400)
+
+# you can do image to text, or image to html ratios
+# rawbody       PDF_TO_HTML_RATIO       eval:pdf_image_to_text_ratio(0.000, 0.015)
+# body          PDF_TO_TEXT_RATIO       eval:pdf_image_to_text_ratio(0.000, 0.008)
+
+# you can do minimum dimension matches
+# body          PDF_SIZE_RANGE_1        eval:pdf_image_size_range(300,300)
+
+# you can do ranged dimension matches
+# body          PDF_SIZE_RANGE_2        eval:pdf_image_size_range(200, 300, 250, 350)
+
+# you can count the number of pdf mime parts
+# body          PDF_MIME_COUNT_1        eval:pdf_count(1,1)
+# body          PDF_MIME_COUNT_2_PLUS   eval:pdf_count(2)
+
+# you can count the number of images inside the pdfs
+# body          PDF_IMG_COUNT_1         eval:pdf_image_count(1,1)
+# body          PDF_IMG_COUNT_2_PLUS    eval:pdf_image_count(2)
+
+# you can determine pixel coverage
+# body          PDF_AREA_SMALL          eval:pdf_pixel_coverage(1,100000)
+
+
+# match a md5 or fuzzy md5 signature of the pdf
+
+# body          PDF_BAD_MD5             eval:pdf_match_md5('C359F8F89B290DA99DC997ED50117CDF')
+# body          PDF_BAD_FUZZY           eval:pdf_match_fuzzy_md5('7340821445D975EEF6F5BDE2EC257900')
+
+# Now you can match against certain details if they are found in the PDF.
+# A regex match is used on the value specified, so if you want to do an
+# exact match, use anchors  ^value$
+#
+# body          GMD_AUTHOR_MOBILE       eval:pdf_match_details('author','/^mobile$/')
+# body          GMD_PRODUCER_GPL        eval:pdf_match_details('producer','/(?i)^gpl ghostscript/')
+# body          GMD_CREATOR_PSCRIPT5    eval:pdf_match_details('creator','/^PScript5/')
+# body          GMD_TITLE_WORD_DOC1     eval:pdf_match_details('title','/^Microsoft Word \- Document1$/')
+# body          GMD_CREATED_JULY07      eval:pdf_match_details('created','/^200707/')
+# body          GMD_MODIFIED_JULY07     eval:pdf_match_details('modified','/^200707/')
+
+ifplugin Mail::SpamAssassin::Plugin::PDFInfo
+
+#######################################
+# DISABLED RULES, ENABLE IF YOU WANT
+#######################################
+
+# Small area
+# Disabled - Hits Ham
+# body          GMD_PDF_SMALL_AREA      eval:pdf_pixel_coverage(1,100000)
+# describe      GMD_PDF_SMALL_AREA      PDF Area covers 150k pixels or less
+# score         GMD_PDF_SMALL_AREA      0.75
+# counts        GMD_PDF_SMALL_AREA      51s/15h of 10615 corpus (5652s/4963h AxB) 06/25/07
+
+# NOTE - people do send pdf's without message bodies!
+# Disabled - Hits Ham
+# body          GMD_PDF_NO_TXT          eval:pdf_image_to_text_ratio(0.000, 0.005)
+# describe      GMD_PDF_NO_TXT          Low rawbody to pixel area ratio
+# score         GMD_PDF_NO_TXT          0.01
+# counts        GMD_PDF_NO_TXT          64s/3h of 10615 corpus (5652s/4963h AxB) 06/25/07
+
+####################################
+# HERE ARE THE LIVE RULES
+####################################
+
+
+
+######################################################################################################
+# pdf image dimensions
+
+# thin horizontal, common stox.
+#body            GMD_PDF_HORIZ           eval:pdf_image_size_range(100, 450, 240, 800)
+#describe        GMD_PDF_HORIZ           Contains pdf 100-240 (high) x 450-800 (wide)
+#score           GMD_PDF_HORIZ           0.25
+# counts        GMD_PDF_HORIZ           135s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PDF_HORIZ           278s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
+
+# near square, and small.  common stox.
+#body            GMD_PDF_SQUARE          eval:pdf_image_size_range(180, 180, 360, 360)
+#describe        GMD_PDF_SQUARE          Contains pdf 180-360 (high) x 180-360 (wide)
+#score           GMD_PDF_SQUARE          0.50
+# counts        GMD_PDF_SQUARE          36s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PDF_SQUARE          46s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
+
+# thin vertical, very tall.  common stox.
+#body            GMD_PDF_VERT            eval:pdf_image_size_range(450, 100, 800, 240)
+#describe        GMD_PDF_VERT            Contains pdf 450-800 (high) x 100-240 (wide)
+#score           GMD_PDF_VERT            0.90
+# counts        GMD_PDF_VERT            24s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PDF_VERT            10s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
+
+######################################################################################################
+# static checksums
+
+# all static md5 spam runs are complete as of 7/11
+# if there are more, we'll add new rules.
+
+# removed fuzzy rules dated 2007
+# Get fuzzy info:
+# cat msg.eml | spamassassin --debug pdfinfo  2>&1 | grep fuzzy 2>&1
+
+# sample rules ONLY
+# fuzzy checksum for bad stox
+#body            GMD_PDF_FUZZY1_T1       eval:pdf_match_fuzzy_md5('57EBC1FFB1A24CC14AE23E1E227C3484')
+#describe        GMD_PDF_FUZZY1_T1       Fuzzy MD5 Match 57EBC1FFB1A24CC14AE23E1E227C3484
+#score           GMD_PDF_FUZZY1_T1       0.001
+
+# same as rule above using fuzzy md5 of pdf structure
+#body            GMD_PDF_FUZZY2_T1       eval:pdf_match_fuzzy_md5('653C8AA9FDFD03D382523488058360A2')
+#describe        GMD_PDF_FUZZY2_T1       Fuzzy MD5 Match 653C8AA9FDFD03D382523488058360A2
+#score           GMD_PDF_FUZZY2_T1       0.001
+
+
+######################################################################################################
+# pdf_match_details()
+
+# from embedded link spam
+#body            GMD_AUTHOR_COLET        eval:pdf_match_details('author','/^colet$/')
+#describe        GMD_AUTHOR_COLET        PDF author was 'colet'
+#score           GMD_AUTHOR_COLET        4.50
+# counts        GMD_AUTHOR_COLET        1s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+# counts        GMD_AUTHOR_COLET        2s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+
+# from full page pdf stock spammer.
+#body            GMD_AUTHOR_MOBILE       eval:pdf_match_details('author','/^mobile$/')
+#describe        GMD_AUTHOR_MOBILE       PDF author was 'mobile'
+#score           GMD_AUTHOR_MOBILE       2.75
+# counts        GMD_AUTHOR_MOBILE       2s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_AUTHOR_MOBILE       55s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+
+# txt only stock spam
+#body            GMD_AUTHOR_OOO          eval:pdf_match_details('author','/^openofficeuser$/')
+#describe        GMD_AUTHOR_OOO          PDF author was 'openofficeuser'
+#score           GMD_AUTHOR_OOO          1.75
+# counts        GMD_AUTHOR_OOO          1s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+# counts        GMD_AUTHOR_OOO          118s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+
+# txt only stock spam
+#body            GMD_AUTHOR_HPADMIN      eval:pdf_match_details('author','/^HP_Administrator/')
+#describe        GMD_AUTHOR_HPADMIN      PDF author was 'HP_Administrator'
+#score           GMD_AUTHOR_HPADMIN      0.25
+# counts        GMD_AUTHOR_HPADMIN      105s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_AUTHOR_HPADMIN      27s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
+
+# generic rule for software used to produce the pdf.
+#body            GMD_PRODUCER_GPL        eval:pdf_match_details('producer','/^(?:gnu|gpl) ghostscript/i')
+#describe        GMD_PRODUCER_GPL        PDF producer was GPL Ghostscript
+#score           GMD_PRODUCER_GPL        0.25
+# counts        GMD_PRODUCER_GPL        227s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PRODUCER_GPL        85s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+
+# generic rule for software used to produce the pdf.
+#body            GMD_PRODUCER_POWERPDF   eval:pdf_match_details('producer','/^PowerPdf 0\./')
+#describe        GMD_PRODUCER_POWERPDF   PDF producer was PowerPDF
+#score           GMD_PRODUCER_POWERPDF   0.25
+# counts        GMD_PRODUCER_POWERPDF   0s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+# counts        GMD_PRODUCER_POWERPDF   0s/0h of 5641 corpus (4064s/1577h AxB-MANUAL) 07/11/07
+
+# producer is bcl
+#body            GMD_PRODUCER_EASYPDF    eval:pdf_match_details('producer','/^BCL easyPDF/')
+#describe        GMD_PRODUCER_EASYPDF    PDF producer was BCL easyPDF
+#score           GMD_PRODUCER_EASYPDF    0.25
+
+# simple check for encryption used inside pdf.
+# recommend meta with something else...
+#body            GMD_PDF_ENCRYPTED       eval:pdf_is_encrypted()
+#describe        GMD_PDF_ENCRYPTED       Attached PDF is encrypted
+#score           GMD_PDF_ENCRYPTED       0.60
+# counts        GMD_PDF_ENCRYPTED       13s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
+
+# simple check for empty msg body when there is one or more pdf attachments present.
+#body            GMD_PDF_EMPTY_BODY      eval:pdf_is_empty_body()
+#describe        GMD_PDF_EMPTY_BODY      Attached PDF with empty message body
+#score           GMD_PDF_EMPTY_BODY      0.25
+# counts        GMD_PDF_EMPTY_BODY      1638s/20h of 27034 corpus (24636s/2398h AxB-MANUAL) 07/19/07
+
+######################################################################################################
+# metas
+#meta            __GMD_PDF_CHECKSUM      ( GMD_PDF_FUZZY1_T1 || GMD_PDF_FUZZY2_T1 || GMD_PDF_FUZZY2_T2 || GMD_PDF_FUZZY2_T3 || GMD_PDF_FUZZY2_T4 || GMD_PDF_FUZZY2_T5 || GMD_PDF_FUZZY2_T6 || GMD_PDF_FUZZY2_T7 || GMD_PDF_FUZZY2_T9 || GMD_PDF_FUZZY2_T10 || GMD_PDF_FUZZY2_T11 || GMD_PDF_FUZZY2_T12 )
+#meta            __GMD_PDF_DETAIL        ( GMD_AUTHOR_COLET || GMD_AUTHOR_MOBILE || GMD_AUTHOR_OOO || GMD_AUTHOR_HPADMIN || GMD_PRODUCER_GPL || GMD_PRODUCER_POWERPDF || GMD_PRODUCER_EASYPDF )
+#meta            __GMD_PDF_DIMS          ( GMD_PDF_VERT || GMD_PDF_HORIZ || GMD_PDF_SQUARE )
+#meta            __GMD_PDF_PRODUCERS     ( GMD_PRODUCER_GPL || GMD_PRODUCER_POWERPDF || GMD_PRODUCER_EASYPDF )
+
+# rule hits ham by itself, so use it only as part of a meta.
+#body            __GMD_PDF_NO_TXT        eval:pdf_image_to_text_ratio(0.000, 0.005)
+
+# meta checksum hit with image dimensions
+#meta            GMD_PDF_STOX_M1         ( __GMD_PDF_CHECKSUM && __GMD_PDF_DIMS)
+#describe        GMD_PDF_STOX_M1         PDF Stox spam
+#score           GMD_PDF_STOX_M1         3.25
+# counts        GMD_PDF_STOX_M1         159s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PDF_STOX_M1         40s/0h of 11773 corpus (10988s/785h AxB2-TRAPS) 07/11/07
+
+# meta checksum hit to pdf details
+#meta            GMD_PDF_STOX_M2         ( __GMD_PDF_CHECKSUM && __GMD_PDF_DETAIL )
+#describe        GMD_PDF_STOX_M2         PDF Stox spam
+#score           GMD_PDF_STOX_M2         2.95
+# counts        GMD_PDF_STOX_M2         223s/0h of 6132 corpus (4555s/1577h AxB-MANUAL) 07/11/07
+# counts        GMD_PDF_STOX_M2         29s/0h of 10767 corpus (9986s/781h AxB2-TRAPS) 07/11/07
+
+# meta dimensions and encryption
+#meta            GMD_PDF_STOX_M3         ( __GMD_PDF_DIMS && GMD_PDF_ENCRYPTED )
+#describe        GMD_PDF_STOX_M3         PDF Stox spam
+#score           GMD_PDF_STOX_M3         2.25
+# counts        GMD_PDF_STOX_M3         12s/0h of 34051 corpus (33259s/792h AxB2-TRAPS) 07/13/07
+
+# meta checksum with no text
+#meta            GMD_PDF_STOX_M4         ( __GMD_PDF_CHECKSUM && (__GMD_PDF_NO_TXT || GMD_PDF_EMPTY_BODY))
+#describe        GMD_PDF_STOX_M4         PDF Stox spam
+#score           GMD_PDF_STOX_M4         2.95
+
+# meta no body text along with automated pdf production.
+#meta            GMD_PDF_STOX_M5         ( __GMD_PDF_PRODUCERS &&  (__GMD_PDF_NO_TXT || GMD_PDF_EMPTY_BODY))
+#describe        GMD_PDF_STOX_M5         PDF Stox Spam
+#score           GMD_PDF_STOX_M5         1.00
+
+endif
\ No newline at end of file

Propchange: spamassassin/trunk/rules/20_pdfinfo.cf
------------------------------------------------------------------------------
    svn:executable = *