You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2006/10/25 18:15:35 UTC
svn commit: r467701 [2/2] - in /spamassassin/branches/jm_re2c_hacks: ./ build/ build/automc/ lib/ lib/Mail/SpamAssassin/ lib/Mail/SpamAssassin/Bayes/ lib/Mail/SpamAssassin/BayesStore/ lib/Mail/SpamAssassin/Conf/ lib/Mail/SpamAssassin/Message/ lib/Mail/...

Modified: spamassassin/branches/jm_re2c_hacks/rules/20_head_tests.cf
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/20_head_tests.cf?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/20_head_tests.cf (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/20_head_tests.cf Wed Oct 25 09:15:31 2006
@@ -27,8 +27,551 @@
 
 ###########################################################################
 
+# partial messages; currently-theoretical attack
+# unsurprisingly this hits 0/0 right now.
+header FRAGMENTED_MESSAGE	Content-Type =~ /\bmessage\/partial/i
+describe FRAGMENTED_MESSAGE	Partial message
+tflags FRAGMENTED_MESSAGE       userconf
+
+# this is also mostly-theoretical, so allow 0 hits
+header HEAD_LONG                eval:check_msg_parse_flags('truncated_header')
+describe HEAD_LONG              Message headers are very long
+tflags HEAD_LONG                userconf
+
+###########################################################################
+
+header FROM_BLANK_NAME		From =~ /(?:\s|^)"" <\S+>/i
+describe FROM_BLANK_NAME	From: contains empty name
+
+###########################################################################
+# numeric address rules, these are written to avoid overlap with each other
+
+header __FROM_ENDS_IN_NUMS	From:addr =~ /\D\d{8,}\@/i
+
+header FROM_STARTS_WITH_NUMS	From:addr =~ /^\d{6,}\S+\@/i
+describe FROM_STARTS_WITH_NUMS	From: starts with many numbers
+
+# don't match US/Canada phone numbers: 10 digits optionally preceded by a "1"
+header __FROM_ALL_NUMS		From:addr =~ /^(?:\d{1,9}|[02-9]\d{10}|\d{12,})@/
+
+###########################################################################
+
+header FROM_OFFERS		From:addr =~ /\@\S*offers(?![eo]n\b)/i
+describe FROM_OFFERS		From address is "at something-offers"
+
+header FROM_NO_USER		From =~ /(?:^\@|<\@| \@[^\)<]*$|<>)/ [if-unset: unset@unset.unset]
+describe FROM_NO_USER		From: has no local-part before @ sign
+
+# also 100% valid
+header FAKED_UNDISC_RECIPS	To =~ /undisclosed[_ ]*recipient(?:s[^:]|[^s])/i
+describe FAKED_UNDISC_RECIPS	Faked To "Undisclosed-Recipients"
+
+header PLING_QUERY		Subject =~ /\?.*!|!.*\?/
+describe PLING_QUERY		Subject has exclamation mark and question mark
+
+
+header MSGID_SPAM_99X9XX99	MESSAGEID =~ /^<\d\d\d\d\d\d[a-z]\d[a-z][a-z]\d\d\$[a-z][a-z][a-z]\d\d\d\d\d\$\d\d\d\d\d\d\d\d\@/
+describe MSGID_SPAM_99X9XX99	Spam tool Message-Id: (99x9xx99 variant)
+
+header MSGID_SPAM_ALPHA_NUM	MESSAGEID =~ /<[A-Z]{7}-000[0-9]{10}\@[a-z]*>/
+describe MSGID_SPAM_ALPHA_NUM	Spam tool Message-Id: (alpha-numeric variant)
+
+header MSGID_SPAM_CAPS		Message-ID =~ /^\s*<?[A-Z]+\@(?!(?:mailcity|whowhere)\.com)/
+describe MSGID_SPAM_CAPS	Spam tool Message-Id: (caps variant)
+
+header MSGID_SPAM_LETTERS	Message-Id =~ /<[a-z]{5,}\@(\S+\.)+\S+>/
+describe MSGID_SPAM_LETTERS	Spam tool Message-Id: (letters variant)
+
+
+header MSGID_NO_HOST            MESSAGEID =~ /\@>(?:$|\s)/m
+describe MSGID_NO_HOST 		Message-Id has no hostname
+
+# negative lookahead exempts this MUA from circa 1997-2000 
+# X-Mailer: Microsoft Outlook Express 4.71.1712.3
+# Message-ID: <01...@andrew>
+header __MSGID_DOLLARS_OK	MESSAGEID =~ /<[0-9a-f]{4,}\$[0-9a-f]{4,}\$[0-9a-f]{4,}\@\S+>/m
+header __MSGID_DOLLARS_MAYBE	MESSAGEID =~ /<\w{4,}\$\w{4,}\$(?!localhost)\w{4,}\@\S+>/mi
+meta MSGID_DOLLARS_RANDOM	__MSGID_DOLLARS_MAYBE && !__MSGID_DOLLARS_OK
+
+# bit of a ratware rule, but catches a bit more than just the one ratware
+header __MSGID_RANDY		Message-ID =~ /<[a-z\d][a-z\d\$-]{10,29}[a-z\d]\@[a-z\d][a-z\d.]{3,12}[a-z\d]>/
+# heuristic to eliminate most good Message-ID formats
+header __MSGID_OK_HEX		Message-ID =~ /\b[a-f\d]{8}\b/
+header __MSGID_OK_DIGITS	Message-ID =~ /\d{10}/
+header __MSGID_OK_HOST		Message-ID =~ /\@(?:\D{2,}|(?:\d{1,3}\.){3}\d{1,3})>/
+meta MSGID_RANDY	(__MSGID_RANDY && !(__MSGID_OK_HEX || __MSGID_OK_DIGITS || __MSGID_OK_HOST))
+describe MSGID_RANDY		Message-Id has pattern used in spam
+
+# bug 3395: Message-ID whose local part is all upper-case letters at yahoo.com
+header MSGID_YAHOO_CAPS		Message-ID =~ /<[A-Z]+\@yahoo.com>/
+describe MSGID_YAHOO_CAPS	Message-ID has ALLCAPS@yahoo.com
+
+###########################################################################
+
+header   __AT_AOL_MSGID		MESSAGEID =~ /\@aol\.com\b/i
+header   __FROM_AOL_COM		From =~ /\@aol\.com\b/i
+meta     FORGED_MSGID_AOL	(__AT_AOL_MSGID && !__FROM_AOL_COM)
+describe FORGED_MSGID_AOL	Message-ID is forged, (aol.com)
+
+header   __AT_EXCITE_MSGID	MESSAGEID =~ /\@excite\.com\b/i
+header   __MY_RCVD_EXCITE	Received =~ /\.excite\.com\b/i
+meta     FORGED_MSGID_EXCITE	(__AT_EXCITE_MSGID && !__MY_RCVD_EXCITE)
+describe FORGED_MSGID_EXCITE	Message-ID is forged, (excite.com)
+
+header   __AT_HOTMAIL_MSGID	MESSAGEID =~ /\@hotmail\.com\b/i
+header   __FROM_HOTMAIL_COM	From =~ /\@hotmail\.com\b/i
+meta     FORGED_MSGID_HOTMAIL	(__AT_HOTMAIL_MSGID && (!__FROM_HOTMAIL_COM && !__FROM_MSN_COM && !__FROM_YAHOO_COM))
+describe FORGED_MSGID_HOTMAIL	Message-ID is forged, (hotmail.com)
+
+header   __AT_MSN_MSGID		MESSAGEID =~ /\@msn\.com\b/i
+header   __FROM_MSN_COM		From =~ /\@msn\.com\b/i
+meta     FORGED_MSGID_MSN	(__AT_MSN_MSGID && (!__FROM_MSN_COM && !__FROM_HOTMAIL_COM && !__FROM_YAHOO_COM))
+describe FORGED_MSGID_MSN	Message-ID is forged, (msn.com)
+
+header   __AT_YAHOO_MSGID	MESSAGEID =~ /\@yahoo\.com\b/i
+header   __FROM_YAHOO_COM	From =~ /\@yahoo\.com\b/i
+meta     FORGED_MSGID_YAHOO	(__AT_YAHOO_MSGID && !__FROM_YAHOO_COM)
+describe FORGED_MSGID_YAHOO	Message-ID is forged, (yahoo.com)
+
+###########################################################################
+
+header __MSGID_BEFORE_RECEIVED	ALL =~ /\nMessage-Id:.*\nReceived:/si
+header __MSGID_BEFORE_OKAY	Message-Id =~ /\@[a-z0-9.-]+\.(?:yahoo|wanadoo)(?:\.[a-z]{2,3}){1,2}>/
+meta MSGID_FROM_MTA_HEADER	(__MSGID_BEFORE_RECEIVED && !__MSGID_BEFORE_OKAY)
+describe MSGID_FROM_MTA_HEADER	Message-Id was added by a relay
+
+header MSGID_FROM_MTA_HOTMAIL	Message-Id =~ /<MC\d{1,2}-F{1,2}\w{21,22}\@\S*hotmail\.com>/
+describe MSGID_FROM_MTA_HOTMAIL	Message-Id was added by a hotmail.com relay
+
+header MSGID_LONG		MESSAGEID =~ /<.{160,}>|<.{140,}\@|\@.{55,}>/m
+describe MSGID_LONG		Message-ID is unusually long
+
+header MSGID_SHORT		MESSAGEID =~ /^.{1,15}$|<.{0,4}\@/m
+describe MSGID_SHORT		Message-ID is unusually short
+
+header MSGID_MULTIPLE_AT	MESSAGEID =~ /<[^>]*\@[^>]*\@/
+describe MSGID_MULTIPLE_AT	Message-ID contains multiple '@' characters
+
+###########################################################################
+
+header DATE_SPAMWARE_Y2K	Date =~ /^[A-Z][a-z]{2}, \d\d [A-Z][a-z]{2} [0-6]\d \d\d:\d\d:\d\d [A-Z]{3}$/
+describe DATE_SPAMWARE_Y2K	Date header uses unusual Y2K formatting
+
+# as noted on the dev@ list, ":60" is valid for seconds when there's a leap
+# second (12/31/2005 for instance), so let's accept that as valid.  ISO 8601
+# apparently allows for it.
+# WRT the tests, remember that ok and fail are reversed -- so valid dates
+# should be "fail" and invalid dates should be "ok".
+header INVALID_DATE		Date !~ /^\s*(?:(?i:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?\s*(?:[12]\d|3[01]|0?[1-9])\s+(?i:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s+(?:19[7-9]\d|2\d{3})\s+(?:[01]?\d|2[0-3])\:[0-5]\d(?::(?:[0-5]\d|60))?\s+(?:[AP]M\s+)?(?:[+-][0-9]{4}|UT|[A-Z]{2,3}T)(?:\s+\(.*\))?\s*$/ [if-unset: Wed, 31 Jul 2002 16:41:57 +0200]
+describe INVALID_DATE		Invalid Date: header (not RFC 2822)
+test INVALID_DATE fail    Sat, 31 Dec 2005 23:59:60 -0500
+test INVALID_DATE fail    Wed, 31 Jul 2002 16:41:57 +0200
+test INVALID_DATE ok      Sat, 31 Dec 2005 24:00:00 -0500
+test INVALID_DATE ok      Sat, 31 Dec 2005 23:00:00
+test INVALID_DATE ok      Thurs, 31 Jul 2002 16:41:57 +0200
+
+# allow +1300, NZ timezone
+header INVALID_DATE_TZ_ABSURD	Date =~ /[-+](?:1[4-9]\d\d|[2-9]\d\d\d)$/
+describe INVALID_DATE_TZ_ABSURD	Invalid Date: header (timezone does not exist)
+
+header INVALID_TZ_CST		ALL =~ /[+-]\d\d[30]0(?<!-0600|-0500|\+0800|\+0930|\+1030)\s+(?:\bCST\b|\(CST\))/
+describe INVALID_TZ_CST		Invalid date in header (wrong CST timezone)
+
+header INVALID_TZ_EST		ALL =~ /[+-]\d\d[30]0(?<!-0500|-0300|\+1000|\+1100)\s+(?:\bEST\b|\(EST\))/
+describe INVALID_TZ_EST		Invalid date in header (wrong EST timezone)
+
+
+###########################################################################
+# MIME encoding with spam characteristics
+
+header __SUBJECT_NEEDS_MIME	Subject =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/
+header __SUBJECT_ENCODED_QP	Subject:raw =~ /=\?\S+\?Q\?/i
+header __SUBJECT_ENCODED_B64	Subject:raw =~ /=\?\S+\?B\?/i
+
+
+
+header __FROM_NEEDS_MIME	From =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/
+header __FROM_ENCODED_QP	From:raw =~ /=\?\S+\?Q\?/i
+header __FROM_ENCODED_B64	From:raw =~ /=\?\S+\?B\?/i
+
+
+meta FROM_EXCESS_BASE64		__FROM_ENCODED_B64 && !__FROM_NEEDS_MIME
+describe FROM_EXCESS_BASE64	From: base64 encoded unnecessarily
+
+
+###########################################################################
+# ADV tags in various languages
+
+header ENGLISH_UCE_SUBJECT	Subject =~ /^[^0-9a-z]*adv(?:ert)?\b/i
+describe ENGLISH_UCE_SUBJECT	Subject contains an English UCE tag
+
+# alan premselaar <al...@12inch.com>, see SpamAssassin-talk list 2003-03
+# quinlan: 2003-03-23 here are more generic Japanese iso-2022-jp codes
+# ("not yet acceptance" or "email") + "announcement"
+# FWIW, according to Peter Evans, this should be sufficient to catch the
+# UCE tag and a common attempt at evasion (using the "sue" instead of
+# "mi" Chinese character).  2006-10-12: updated by bug 4021.
+header JAPANESE_UCE_SUBJECT     Subject =~ /\e\$B.*(?:L\$>5Bz|EE;R%a!<%k)(?:8x|9-)9p/
+describe JAPANESE_UCE_SUBJECT	Subject contains a Japanese UCE tag
+
+# check body for "shou nin daku kou koku" UCE tag (bug 4021)
+body __JAPANESE_UCE_BODY        /(?:L\$>5Bz|EE;R%a!<%k)(?:8x|9-)9p/
+
+meta JAPANESE_UCE_BODY (__ISO_2022_JP_DELIM && __JAPANESE_UCE_BODY)
+describe JAPANESE_UCE_BODY      Body contains Japanese UCE tag
+
+# quinlan: "advertisement" in Russian KOI8-R
+# (no longer common, but worth noting in future)
+#header RUSSIAN_UCE_SUBJECT	Subject =~ /\xf0\xe5\xea\xeb\xe0\xec\xf3/
+#describe RUSSIAN_UCE_SUBJECT	Subject contains a Russian UCE tag
+
+# Korean UCE Subject: lines are usually 8-bit, but are occasionally encoded
+# with quoted-printable or base64.
+#
+# \xbc\xba\xc0\xce means "adult"
+# \xb1\xa4\xb0\xed means "advertisement"
+# \xc1\xa4\xba\xb8 means "information"
+# \xc8\xab\xba\xb8 means "publicity"
+#
+# Each two byte sequence is one Korean letter; the spaces and periods are
+# sometimes used to obscure the words.  \xb1\xa4\xb0\xed is the most common
+# tag and is sometimes very obscured so we look harder.
+#
+header KOREAN_UCE_SUBJECT	Subject =~ /[({[<][. ]*(?-i:\xbc\xba[. ]*\xc0\xce[. ]*)?(?-i:\xb1\xa4(?:[. ]*|[\x00-\x7f]{0,3})\xb0\xed|\xc1\xa4[. ]*\xba\xb8|\xc8\xab[. ]*\xba\xb8)[. ]*[)}\]>]/
+describe KOREAN_UCE_SUBJECT	Subject: contains Korean unsolicited email tag
+
+###########################################################################
+
+# two reliable signatures
+header __DOUBLE_IP_SPAM_1	Received =~ /from \[\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\] by \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} with/
+header __DOUBLE_IP_SPAM_2	Received =~ /from\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+by\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3};/
+# loose match
+header __DOUBLE_IP_LOOSE	Received =~ /(?:\b(?:from|by)\b.{1,4}\b\d{1,3}[._-]\d{1,3}[._-]\d{1,3}[._-]\d{1,3}(?<!127\.0\.0\.1)\b.{0,4}){2}/i
+# spam signature
+meta RCVD_DOUBLE_IP_SPAM	(__DOUBLE_IP_SPAM_1 || __DOUBLE_IP_SPAM_2)
+describe RCVD_DOUBLE_IP_SPAM	Bulk email fingerprint (double IP) found
+# other matches
+meta RCVD_DOUBLE_IP_LOOSE	(__DOUBLE_IP_LOOSE && !RCVD_DOUBLE_IP_SPAM)
+describe RCVD_DOUBLE_IP_LOOSE   Received: by and from look like IP addresses
+
+header FORGED_TELESP_RCVD	Received =~ /\.(?!br).. \(\d+-\d+-\d+-\d+\.dsl\.telesp\.net\.br /
+describe FORGED_TELESP_RCVD	Contains forged hostname for a DSL IP in Brazil
+
+# forgery meta-rules: more reliable than their inputs
+meta CONFIRMED_FORGED		(__FORGED_RCVD_TRAIL && (__FORGED_AOL_RCVD || __FORGED_HOTMAIL_RCVD || __FORGED_EUDORAMAIL_RCVD || FORGED_YAHOO_RCVD || __FORGED_JUNO_RCVD || FORGED_GW05_RCVD))
+describe CONFIRMED_FORGED	Received headers are forged
+
+meta MULTI_FORGED		((__FORGED_AOL_RCVD + __FORGED_HOTMAIL_RCVD + __FORGED_EUDORAMAIL_RCVD + FORGED_YAHOO_RCVD + __FORGED_JUNO_RCVD + FORGED_GW05_RCVD) > 1)
+describe MULTI_FORGED		Received headers indicate multiple forgeries
+
+header NONEXISTENT_CHARSET	Content-Type =~ /charset=.?DEFAULT/
+describe NONEXISTENT_CHARSET	Character set doesn't exist
+
+
+
+header MISSING_DATE             Date =~ /^UNSET$/ [if-unset: UNSET]
+describe MISSING_DATE           Missing Date: header
+
+header __HAS_SUBJECT		exists:Subject
+meta MISSING_SUBJECT		!__HAS_SUBJECT
+describe MISSING_SUBJECT	Missing Subject: header
+
+header GAPPY_SUBJECT		Subject =~ /\b(?:[a-z]([-_. =~\/:,*!\@\#\$\%\^&+;\"\'<>\\])\1{0,2}){4}/i
+describe GAPPY_SUBJECT		Subject: contains G.a.p.p.y-T.e.x.t
+
+### header existence tests (description is added automatically)
+
+# X-Fix example: NTMail fixed non RFC822 compliant EMail message
+#
+# X-PMFLAGS is all caps
+#
+# Headers that seem to only be used by a single spamming software and
+# are found together in the same message:
+# 1. X-MailingID and X-ServerHost
+# 2. X-Stormpost-To and X-List-Unsubscribe
+#
+# not spammish: X-EM-Registration, X-EM-Version, X-Antiabuse, X-List-Host,
+# X-Message-Id
+# bad FP rate: Comment, Date-warning
+
+header PREVENT_NONDELIVERY	exists:Prevent-NonDelivery-Report
+describe PREVENT_NONDELIVERY	Message has Prevent-NonDelivery-Report header
+
+header X_IP			exists:X-IP
+describe X_IP			Message has X-IP header
+
+header   __HAS_MIMEOLE          exists:X-MimeOLE
+header   __HAS_MSMAIL_PRI       exists:X-MSMail-Priority
+header   __HAS_SQUIRRELMAIL_IN_MAILER	X-Mailer =~ /SquirrelMail\b/
+meta     MISSING_MIMEOLE	(__HAS_MSMAIL_PRI && !__HAS_MIMEOLE && !__HAS_SQUIRRELMAIL_IN_MAILER)
+describe MISSING_MIMEOLE	Message has X-MSMail-Priority, but no X-MimeOLE
+
+header __HAS_X_MAILER		exists:X-Mailer
+
+header __IS_EXCH		X-MimeOLE =~ /Produced By Microsoft Exchange V/
+
+header SUBJ_AS_SEEN		Subject =~ /\bAs Seen/i
+describe SUBJ_AS_SEEN		Subject contains "As Seen"
+
+header SUBJ_DOLLARS             Subject =~ /^\$[0-9.,]+\b/
+describe SUBJ_DOLLARS           Subject starts with dollar amount
+
+
+
+
+
+
+
+header SUBJ_YOUR_DEBT		Subject =~ /Your (?:Bills|Debt|Credit)/i
+describe SUBJ_YOUR_DEBT		Subject contains "Your Bills" or similar
+
+header SUBJ_YOUR_FAMILY		Subject =~ /Your Family/i
+describe SUBJ_YOUR_FAMILY	Subject contains "Your Family"
+
+
+# the real services never HELO as 'foo.com', instead 'mail.foo.com' or
+# something like that.  Note: be careful when expanding this... legit dotcom
+# HELOers include: hotmail.com, drizzle.com, lockergnome.com.
+header RCVD_FAKE_HELO_DOTCOM    Received =~ /^from (?:msn|yahoo|yourwebsite|lycos|excite|cs|aol|localhost|koreanmail|allexecs|mydomain|juno|eudoramail|compuserve|desertmail|caramail)\.com \(/m
+describe RCVD_FAKE_HELO_DOTCOM  Received contains a faked HELO hostname
+
+header SUBJECT_DIET		Subject =~ /\bLose .*(?:pounds|lbs|weight)/i
+describe SUBJECT_DIET		Subject talks about losing pounds
+
+header EXTRA_MPART_TYPE         Content-Type =~ /(?:\s*multipart\/)?.* type=/i
+describe EXTRA_MPART_TYPE       Header has extraneous Content-type:...type= entry
+
+header TO_RECIP_MARKER          To =~ /\#recipient\#/
+describe TO_RECIP_MARKER        To header contains 'recipient' marker
+
+# MIME boundary tests; spam tools use distinctive patterns.
+header MIME_BOUND_DD_DIGITS	Content-Type =~ /boundary=\"--\d+\"/
+describe MIME_BOUND_DD_DIGITS	Spam tool pattern in MIME boundary
+header MIME_BOUND_DIGITS_7	Content-Type =~ /boundary=\d{9}\.\d{13}/
+describe MIME_BOUND_DIGITS_7	Spam tool pattern in MIME boundary
+header MIME_BOUND_DIGITS_15	Content-Type =~ /boundary=\"\d{15,}\"/
+describe MIME_BOUND_DIGITS_15	Spam tool pattern in MIME boundary
+header MIME_BOUND_MANY_HEX	Content-Type =~ /boundary="[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12}"/
+describe MIME_BOUND_MANY_HEX	Spam tool pattern in MIME boundary
+header __NEXTPART_ALL		Content-Type =~ /NextPart/
+header __NEXTPART_NORMAL	Content-Type =~ /="(?:----_?=_)?NextPart_[\dA-F]{3}(_[\dA-F]{3,8})?_[\dA-F]{8}\.[\dA-F]{8}"/
+meta MIME_BOUND_NEXTPART	(__NEXTPART_ALL && !__NEXTPART_NORMAL)
+describe MIME_BOUND_NEXTPART	Spam tool pattern in MIME boundary
+
+# note: the first alternation is anchored for speed
+header TO_MALFORMED             To !~ /(?:^|[^\S"])(?:(?:\"[^\"]+\"|\S+)\@\S+\.\S+|^\s*.+:\s*;|^\s*\"[^\"]+\":\s*;|^\s*\([^\)]*\)\s*$|<\S+(?:\!\S+){1,}>|^\s*$)/ [if-unset: unset@unset.unset]
+describe TO_MALFORMED           To: has a malformed address
+
+header __CD                     exists:Content-Disposition
+header __CT                     exists:Content-Type
+header __CTE                    exists:Content-Transfer-Encoding
+header __MIME_VERSION           exists:MIME-Version
+header __CT_TEXT_PLAIN          Content-Type =~ /^text\/plain\b/i
+meta MIME_HEADER_CTYPE_ONLY     (!__CD && !__CTE && __CT && !__MIME_VERSION && !__CT_TEXT_PLAIN)
+describe MIME_HEADER_CTYPE_ONLY 'Content-Type' found without required MIME headers
+
+header WITH_LC_SMTP		Received =~ /\swith\ssmtp;\s/
+describe WITH_LC_SMTP		Received line contains spam-sign (lowercase smtp)
+
+
+header SUBJ_BUY                 Subject =~ /^buy/i
+describe SUBJ_BUY               Subject line starts with Buy or Buying
+
+# seems to be ratware
+header RCVD_AM_PM		Received =~ /; [A-Z][a-z][a-z], \d{1,2} \d{4} \d{1,2}:\d\d:\d\d [AP]M [+-]\d{4}/
+describe RCVD_AM_PM		Received headers forged (AM/PM)
+
+header __USER_AGENT_MSN             X-Mailer =~ /^MSN Explorer /
+
+# host no longer exists according to administrator
+header FAKE_OUTBLAZE_RCVD	Received =~ /\.mr\.outblaze\.com/
+describe FAKE_OUTBLAZE_RCVD	Received header contains faked 'mr.outblaze.com'
+
+header SUBJ_2_NUM_PARENS        Subject =~ /^\(\d+\).*\(\d+\)\s*$/
+describe SUBJ_2_NUM_PARENS      Subject contains common spam sign (2 numbers)
+
+# thanks to David Ritz for passing this on; ready for post-3.0.0
+header UNCLOSED_BRACKET		ALL =~ /\[\d+\r?\n/s
+describe UNCLOSED_BRACKET	Headers contain an unclosed bracket
+
+header FROM_DOMAIN_NOVOWEL	From =~ /\@\S*[bcdfghjklmnpqrstvwxz]{7}/i
+describe FROM_DOMAIN_NOVOWEL	From: domain has series of non-vowel letters
+
+header FROM_LOCAL_NOVOWEL	From =~ /[bcdfghjklmnpqrstvwxz]{7}\S*\@/i
+describe FROM_LOCAL_NOVOWEL	From: localpart has series of non-vowel letters
+
+header FROM_LOCAL_HEX		From =~ /[0-9a-f]{11}\S*\@/i
+describe FROM_LOCAL_HEX		From: localpart has long hexadecimal sequence
+
+header FROM_LOCAL_DIGITS	From =~ /\d{11}\S*\@/i
+describe FROM_LOCAL_DIGITS	From: localpart has long digit sequence
+
+header __TOCC_EXISTS		exists:ToCc
+
+header X_PRIORITY_CC		ALL =~ /\nX-Priority:[^\n]{0,80}\nCc:/si
+describe X_PRIORITY_CC		Cc: after X-Priority: (bulk email fingerprint)
+
+# catch non-RFC2047 compliant messages
+# Apple Mail has a bug where headers will have whitespace around the encoded
+# text, so try to ignore that
+header BAD_ENC_HEADER		ALL =~ /=\?[^?\s]+\?[^?\s]\?\s*[^?]+\s(?!\?=)/
+describe BAD_ENC_HEADER		Message has bad MIME encoding in the header
+
+###########################################################################
+
+ifplugin Mail::SpamAssassin::Plugin::HeaderEval
+
+header __FORGED_AOL_RCVD	        eval:check_for_fake_aol_relay_in_rcvd()
+
+header CHARSET_FARAWAY_HEADER	eval:check_for_faraway_charset_in_headers()
+describe CHARSET_FARAWAY_HEADER	A foreign language charset used in headers
+tflags CHARSET_FARAWAY_HEADER	userconf
+
+    ###################################################################
+
+# illegal characters that should be MIME encoded
+# might want to exempt users using languages that don't use Latin
+# alphabets, but do it in the eval
+
+header SUBJ_ILLEGAL_CHARS	eval:check_illegal_chars('Subject','0.00','2')
+describe SUBJ_ILLEGAL_CHARS	Subject: has too many raw illegal characters
+
+header FROM_ILLEGAL_CHARS	eval:check_illegal_chars('From','0.20','2')
+describe FROM_ILLEGAL_CHARS	From: has too many raw illegal characters
+
+header HEAD_ILLEGAL_CHARS	eval:check_illegal_chars('ALL','0.010','2')
+describe HEAD_ILLEGAL_CHARS	Headers have too many raw illegal characters
+
+    ###################################################################
+
+# a forged Hotmail message; host HELO'd as hotmail.com, but it wasn't
+header __FORGED_HOTMAIL_RCVD	eval:check_for_forged_hotmail_received_headers()
+
+# this, by comparison is more common: from was @hotmail.com, but it wasn't
+header FORGED_HOTMAIL_RCVD2	eval:check_for_no_hotmail_received_headers()
+describe FORGED_HOTMAIL_RCVD2 hotmail.com 'From' address, but no 'Received:'
+
+header __FORGED_EUDORAMAIL_RCVD	eval:check_for_forged_eudoramail_received_headers()
+
+header FORGED_YAHOO_RCVD	eval:check_for_forged_yahoo_received_headers()
+describe FORGED_YAHOO_RCVD	'From' yahoo.com does not match 'Received' headers
+
+header __FORGED_JUNO_RCVD		eval:check_for_forged_juno_received_headers()
+
+header FORGED_GW05_RCVD		eval:check_for_forged_gw05_received_headers()
+describe FORGED_GW05_RCVD	Forged 'by gw05' 'Received:' header found
+
+
+header SORTED_RECIPS		eval:sorted_recipients()
+describe SORTED_RECIPS		Recipient list is sorted by address
+
+header SUSPICIOUS_RECIPS	eval:similar_recipients('0.65','undef')
+describe SUSPICIOUS_RECIPS	Similar addresses in recipient list
+
+# this is quite a common false positive, as it's legal to remove the To: but
+# leave a Cc:, so don't score it high.
+header MISSING_HEADERS		eval:check_for_missing_to_header()
+describe MISSING_HEADERS	Missing To: header
+
+# this variant is local, using the Received hdr itself...
+header ROUND_THE_WORLD_LOCAL	eval:check_for_round_the_world_received_helo()
+describe ROUND_THE_WORLD_LOCAL	Received: says mail sent around the world (HELO)
+
+header DATE_IN_PAST_03_06	eval:check_for_shifted_date('-6', '-3')
+describe DATE_IN_PAST_03_06	Date: is 3 to 6 hours before Received: date
+
+header DATE_IN_PAST_06_12	eval:check_for_shifted_date('-12', '-6')
+describe DATE_IN_PAST_06_12	Date: is 6 to 12 hours before Received: date
+
+header DATE_IN_PAST_12_24	eval:check_for_shifted_date('-24', '-12')
+describe DATE_IN_PAST_12_24	Date: is 12 to 24 hours before Received: date
+
+header DATE_IN_PAST_24_48	eval:check_for_shifted_date('-48', '-24')
+describe DATE_IN_PAST_24_48	Date: is 24 to 48 hours before Received: date
+
+
+header DATE_IN_PAST_96_XX	eval:check_for_shifted_date('undef', '-96')
+describe DATE_IN_PAST_96_XX	Date: is 96 hours or more before Received: date
+
+header DATE_IN_FUTURE_03_06	eval:check_for_shifted_date('3', '6')
+describe DATE_IN_FUTURE_03_06	Date: is 3 to 6 hours after Received: date
+
+header DATE_IN_FUTURE_06_12	eval:check_for_shifted_date('6', '12')
+describe DATE_IN_FUTURE_06_12	Date: is 6 to 12 hours after Received: date
+
+header DATE_IN_FUTURE_12_24	eval:check_for_shifted_date('12', '24')
+describe DATE_IN_FUTURE_12_24	Date: is 12 to 24 hours after Received: date
+
+header DATE_IN_FUTURE_24_48	eval:check_for_shifted_date('24', '48')
+describe DATE_IN_FUTURE_24_48	Date: is 24 to 48 hours after Received: date
+
+header DATE_IN_FUTURE_48_96	eval:check_for_shifted_date('48', '96')
+describe DATE_IN_FUTURE_48_96	Date: is 48 to 96 hours after Received: date
+
+header DATE_IN_FUTURE_96_XX	eval:check_for_shifted_date('96', 'undef')
+describe DATE_IN_FUTURE_96_XX	Date: is 96 hours or more after Received: date
+
+header UNRESOLVED_TEMPLATE	eval:check_unresolved_template()
+describe UNRESOLVED_TEMPLATE	Headers contain an unresolved template
+
+header SUBJ_ALL_CAPS		eval:subject_is_all_caps()
+describe SUBJ_ALL_CAPS		Subject is all capitals
+
+
+header LOCALPART_IN_SUBJECT	eval:check_for_to_in_subject('user')
+describe LOCALPART_IN_SUBJECT	Local part of To: address appears in Subject
+
+header MSGID_OUTLOOK_INVALID	eval:check_outlook_message_id()
+describe MSGID_OUTLOOK_INVALID	Message-Id is fake (in Outlook Express format)
+
+header HEADER_COUNT_CTYPE	eval:check_header_count_range('Content-Type','2','999')
+describe HEADER_COUNT_CTYPE	Multiple Content-Type headers found
+
+endif
+
+###########################################################################
+
+ifplugin Mail::SpamAssassin::Plugin::MIMEEval
+
+header MISSING_HB_SEP		eval:check_msg_parse_flags('missing_head_body_separator')
+describe MISSING_HB_SEP		Missing blank line between message header and body
+tflags MISSING_HB_SEP		userconf
+
+endif
+
+###########################################################################
+
+ifplugin Mail::SpamAssassin::Plugin::RelayEval
+
+header UNPARSEABLE_RELAY        eval:check_relays_unparseable()
+tflags UNPARSEABLE_RELAY        userconf
+describe UNPARSEABLE_RELAY      Informational: message has unparseable relay lines
+
+
+
+header RCVD_HELO_IP_MISMATCH	eval:helo_ip_mismatch()
+describe RCVD_HELO_IP_MISMATCH	Received: HELO and IP do not match, but should
+
+header RCVD_NUMERIC_HELO	eval:check_for_numeric_helo()
+describe RCVD_NUMERIC_HELO	Received: contains an IP address used for HELO
+
+header RCVD_ILLEGAL_IP		eval:check_for_illegal_ip()
+describe RCVD_ILLEGAL_IP	Received: contains illegal IP address
+
+# not used directly right now due to FPs; but CONFIRMED_FORGED turns it
+# into a 1.0 S/O rule anyway, so that's not a problem ;)
+# 2.626   3.6340   1.5251    0.704   0.34    1.44  FORGED_RCVD_TRAIL
+# 0.956   3.3890   0.0000    1.000   0.98    4.30  CONFIRMED_FORGED
+header __FORGED_RCVD_TRAIL	eval:check_for_forged_received_trail()
+
+header NO_RDNS_DOTCOM_HELO	eval:check_for_no_rdns_dotcom_helo()
+describe NO_RDNS_DOTCOM_HELO	Host HELO'd as a big ISP, but had no rDNS
+
+endif
+
 ifplugin Mail::SpamAssassin::Plugin::HeaderEval
 
 header __ENV_AND_HDR_FROM_MATCH	eval:check_for_matching_env_and_hdr_from()
 
 endif
+

Modified: spamassassin/branches/jm_re2c_hacks/rules/20_html_tests.cf
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/20_html_tests.cf?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/20_html_tests.cf (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/20_html_tests.cf Wed Oct 25 09:15:31 2006
@@ -36,14 +36,10 @@
 describe HTML_SHORT_LINK_IMG_2	HTML is very short with a linked image
 describe HTML_SHORT_LINK_IMG_3	HTML is very short with a linked image
 
-meta HTML_SHORT_COMMENT		(__HTML_LENGTH_512 && __COMMENT_EXISTS)
-describe HTML_SHORT_COMMENT	HTML is very short with HTML comments
 
 meta HTML_SHORT_CENTER		(__HTML_LENGTH_384 && __TAG_EXISTS_CENTER)
 describe HTML_SHORT_CENTER	HTML is very short with CENTER tag
 
-meta HTML_TITLE_LONG		__HTML_TITLE_120 && !__MIME_ATTACHMENT
-describe HTML_TITLE_LONG	HTML title is very long
 
 meta HTML_TITLE_SUBJ_DIFF	__HTML_TITLE_SUBJ_DIFF && !__MIME_ATTACHMENT
 
@@ -75,23 +71,11 @@
 meta JS_FROMCHARCODE            (__JS_FROMCHARCODE && __JS_DOCWRITE)
 describe JS_FROMCHARCODE        Document is built from a Javascript charcode array
 
-# A-Z, a-z, 0-9
-rawbody ENTITY_DEC_ALPHANUM	/\&\#0*(?:4[89]|5[0-7]|6[5-9][78]\d|9[0789]|1[01]\d|12[012])\;/
-describe ENTITY_DEC_ALPHANUM	HTML contains needlessly encoded characters
-
-# ! $ % ' ( ) , - . / : ; = ? @ _
 # a good possible rule that may resurface
+# ! $ % ' ( ) , - . / : ; = ? @ _
 #rawbody ENTITY_DEC_OTHER	/\&\#0*(?:3[3679]|4[014567]|5[89]|6[134]|95)\;/
 #describe ENTITY_DEC_OTHER	HTML contains needlessly encoded punctuation
 
-# thanks to Bob Menschel for this one; bug 4116
-rawbody   HTML_EHTML2         m'</html></html>'i
-describe  HTML_EHTML2         HTML has doubled end HTML tag
-
-# bug 3070
-rawbody HTML_TINY_FONT	/\<.*font\-size\:[ \"]*[01][^0-9]+.*\>/i
-describe HTML_TINY_FONT	body contains 1 or 0-point font
-
 body __HIGHBITS                     /(?:[\x80-\xff].?){4}/
 # note: __HIGHBITS is used by HTML_CHARSET_FARAWAY
 
@@ -103,48 +87,6 @@
 body HTML_MESSAGE		eval:html_test('html')
 describe HTML_MESSAGE		HTML included in message
 
-# the HTML percentage range
-# should really be converted into a numeric function test
-body HTML_00_10			eval:html_range('ratio','0.00','0.10')
-body HTML_10_20			eval:html_range('ratio','0.10','0.20')
-body HTML_20_30			eval:html_range('ratio','0.20','0.30')
-body HTML_30_40			eval:html_range('ratio','0.30','0.40')
-body HTML_40_50			eval:html_range('ratio','0.40','0.50')
-body HTML_50_60			eval:html_range('ratio','0.50','0.60')
-body HTML_60_70			eval:html_range('ratio','0.60','0.70')
-body HTML_70_80			eval:html_range('ratio','0.70','0.80')
-body HTML_80_90			eval:html_range('ratio','0.80','0.90')
-body HTML_90_100		eval:html_range('ratio','0.90','1.00')
-describe HTML_00_10		Message is 0% to 10% HTML
-describe HTML_10_20		Message is 10% to 20% HTML
-describe HTML_20_30		Message is 20% to 30% HTML
-describe HTML_30_40		Message is 30% to 40% HTML
-describe HTML_40_50		Message is 40% to 50% HTML
-describe HTML_50_60		Message is 50% to 60% HTML
-describe HTML_60_70		Message is 60% to 70% HTML
-describe HTML_70_80		Message is 70% to 80% HTML
-describe HTML_80_90		Message is 80% to 90% HTML
-describe HTML_90_100		Message is 90% to 100% HTML
-
-# HTML shouting range
-# should really be converted into a numeric function test
-body HTML_SHOUTING3		eval:html_range('max_shouting','2','3')
-body HTML_SHOUTING4		eval:html_range('max_shouting','3','4')
-body HTML_SHOUTING5		eval:html_range('max_shouting','4','5')
-body HTML_SHOUTING6		eval:html_range('max_shouting','5','6')
-body HTML_SHOUTING7		eval:html_range('max_shouting','6','7')
-describe HTML_SHOUTING3		HTML has very strong "shouting" markup
-describe HTML_SHOUTING4		HTML has very strong "shouting" markup
-describe HTML_SHOUTING5		HTML has very strong "shouting" markup
-describe HTML_SHOUTING6		HTML has very strong "shouting" markup
-describe HTML_SHOUTING7		HTML has very strong "shouting" markup
-
-body HTML_TEXT_AFTER_HTML	eval:html_test('text_after_html')
-describe HTML_TEXT_AFTER_HTML	HTML contains text after HTML close tag
-
-body HTML_TEXT_AFTER_BODY	eval:html_test('text_after_body')
-describe HTML_TEXT_AFTER_BODY	HTML contains text after BODY close tag
-
 # HTML comment tests
 body HTML_COMMENT_SHORT		eval:html_text_match('comment', '<!(?!-).{0,6}>')
 describe HTML_COMMENT_SHORT	HTML comment is very short
@@ -155,17 +97,11 @@
 body HTML_EMBEDS		eval:html_test('embeds')
 describe HTML_EMBEDS		HTML with embedded plugin object
 
-body HTML_EVENT_UNSAFE		eval:html_test('html_event_unsafe')
-describe HTML_EVENT_UNSAFE	HTML contains unsafe auto-executing code
 
 body HTML_EXTRA_CLOSE		eval:html_range('closed_extra_ratio', '0.09', 'inf')
 describe HTML_EXTRA_CLOSE	HTML contains far too many close tags
 
-body HTML_FONT_SIZE_TINY	eval:html_eval('min_size', '< 1')
-describe HTML_FONT_SIZE_TINY	HTML font size is tiny
 
-body HTML_FONT_SIZE_NONE	eval:html_eval('min_size', '< 0')
-describe HTML_FONT_SIZE_NONE	HTML font size is negative
 
 body HTML_FONT_SIZE_LARGE	eval:html_range('max_size', '5', '6')
 describe HTML_FONT_SIZE_LARGE	HTML font size is large
@@ -173,14 +109,8 @@
 body HTML_FONT_SIZE_HUGE	eval:html_range('max_size', '6', 'inf')
 describe HTML_FONT_SIZE_HUGE	HTML font size is huge
 
-body HTML_FONT_BIG		eval:html_test('big_font')
-describe HTML_FONT_BIG		HTML tag for a big font size
 
-body HTML_FONT_TINY		eval:html_test('tiny_font')
-describe HTML_FONT_TINY		HTML tag for a tiny font size
 
-body HTML_FONT_INVISIBLE	eval:html_test('font_invisible')
-describe HTML_FONT_INVISIBLE	HTML font color is same as background
 
 body HTML_FONT_LOW_CONTRAST	eval:html_test('font_low_contrast')
 describe HTML_FONT_LOW_CONTRAST	HTML font color similar to background
@@ -188,8 +118,6 @@
 body HTML_FONT_FACE_BAD		eval:html_test('font_face_bad')
 describe HTML_FONT_FACE_BAD	HTML font face is not a word
 
-body HTML_FONT_FACE_CAPS	eval:html_test('font_face_caps')
-describe HTML_FONT_FACE_CAPS	HTML font face has excess capital characters
 
 body HTML_FORMACTION_MAILTO	eval:html_test('form_action_mailto')
 describe HTML_FORMACTION_MAILTO	HTML includes a form which sends mail
@@ -214,56 +142,24 @@
 
 # HTML_IMAGE_RATIO - more image area than text (ratio)
 body HTML_IMAGE_RATIO_02	eval:html_image_ratio('0.000','0.002')
-body HTML_IMAGE_RATIO_04	eval:html_image_ratio('0.002','0.004')
-body HTML_IMAGE_RATIO_06	eval:html_image_ratio('0.004','0.006')
-body HTML_IMAGE_RATIO_08	eval:html_image_ratio('0.006','0.008')
 describe HTML_IMAGE_RATIO_02	HTML has a low ratio of text to image area
-describe HTML_IMAGE_RATIO_04	HTML has a low ratio of text to image area
-describe HTML_IMAGE_RATIO_06	HTML has a low ratio of text to image area
-describe HTML_IMAGE_RATIO_08	HTML has a low ratio of text to image area
-
-body HTML_LINK_PUSH_HERE	eval:html_text_match('anchor', '(?i)(?:push|go|cl[1l]ck)\s*(?:here|this)')
-describe HTML_LINK_PUSH_HERE	HTML link text says "push here" or similar
-
-body HTML_LINK_OPT_OUT		eval:html_text_match('anchor', '(?i)opt.?out')
-describe HTML_LINK_OPT_OUT	HTML link text says "opt out" or similar
 
 # HTML obfuscation
 body HTML_OBFUSCATE_05_10	eval:html_range('obfuscation_ratio','.05','.1')
 body HTML_OBFUSCATE_10_20	eval:html_range('obfuscation_ratio','.1','.2')
 body HTML_OBFUSCATE_20_30	eval:html_range('obfuscation_ratio','.2','.3')
 body HTML_OBFUSCATE_30_40	eval:html_range('obfuscation_ratio','.3','.4')
-body HTML_OBFUSCATE_40_50	eval:html_range('obfuscation_ratio','.4','.5')
 body HTML_OBFUSCATE_50_60	eval:html_range('obfuscation_ratio','.5','.6')
-body HTML_OBFUSCATE_60_70	eval:html_range('obfuscation_ratio','.6','.7')
 body HTML_OBFUSCATE_70_80	eval:html_range('obfuscation_ratio','.7','.8')
-body HTML_OBFUSCATE_80_90	eval:html_range('obfuscation_ratio','.8','.9')
 body HTML_OBFUSCATE_90_100	eval:html_range('obfuscation_ratio','.9','1.0')
 describe HTML_OBFUSCATE_05_10	Message is 5% to 10% HTML obfuscation
 describe HTML_OBFUSCATE_10_20	Message is 10% to 20% HTML obfuscation
 describe HTML_OBFUSCATE_20_30	Message is 20% to 30% HTML obfuscation
 describe HTML_OBFUSCATE_30_40	Message is 30% to 40% HTML obfuscation
-describe HTML_OBFUSCATE_40_50	Message is 40% to 50% HTML obfuscation
 describe HTML_OBFUSCATE_50_60	Message is 50% to 60% HTML obfuscation
-describe HTML_OBFUSCATE_60_70	Message is 60% to 70% HTML obfuscation
 describe HTML_OBFUSCATE_70_80	Message is 70% to 80% HTML obfuscation
-describe HTML_OBFUSCATE_80_90	Message is 80% to 90% HTML obfuscation
 describe HTML_OBFUSCATE_90_100	Message is 90% to 100% HTML obfuscation
 
-# backhair - idea from backhair set by Jennifer Wheeler and Adam Lopresto.
-body HTML_BACKHAIR_2		eval:html_range('backhair_count', '1', '4')
-body HTML_BACKHAIR_4		eval:html_range('backhair_count', '4', '8')
-body HTML_BACKHAIR_8		eval:html_range('backhair_count', '8', 'inf')
-describe HTML_BACKHAIR_2	HTML tags used to obfuscate words
-describe HTML_BACKHAIR_4	HTML tags used to obfuscate words
-describe HTML_BACKHAIR_8	HTML tags used to obfuscate words
-
-# HTML attribute testing
-body HTML_ATTR_BAD		eval:html_range('attr_bad','0.75','1.0')
-describe HTML_ATTR_BAD		HTML has many bad attributes in tags
-body HTML_ATTR_UNIQUE		eval:html_range('attr_unique_bad','0.5','1.0')
-describe HTML_ATTR_UNIQUE	HTML appears to have random attributes in tags
-
 body HTML_TAG_BALANCE_BODY	eval:html_tag_balance('body', '!= 0')
 describe HTML_TAG_BALANCE_BODY	HTML has unbalanced "body" tags
 
@@ -273,55 +169,25 @@
 body HTML_TAG_EXIST_BGSOUND	eval:html_tag_exists('bgsound')
 describe HTML_TAG_EXIST_BGSOUND	HTML has "bgsound" tag
 
-body HTML_TAG_EXIST_MARQUEE	eval:html_tag_exists('marquee')
-describe HTML_TAG_EXIST_MARQUEE	HTML has "marquee" tag
-
-body HTML_TAG_EXIST_TBODY	eval:html_tag_exists('tbody')
-describe HTML_TAG_EXIST_TBODY	HTML has "tbody" tag
-
 # percentage of tags that are not legal elements in HTML
-body HTML_BADTAG_00_10	eval:html_range('bad_tag_ratio','0.00','0.10')
-body HTML_BADTAG_10_20	eval:html_range('bad_tag_ratio','0.10','0.20')
-body HTML_BADTAG_20_30	eval:html_range('bad_tag_ratio','0.20','0.30')
-body HTML_BADTAG_30_40	eval:html_range('bad_tag_ratio','0.30','0.40')
 body HTML_BADTAG_40_50	eval:html_range('bad_tag_ratio','0.40','0.50')
 body HTML_BADTAG_50_60	eval:html_range('bad_tag_ratio','0.50','0.60')
 body HTML_BADTAG_60_70	eval:html_range('bad_tag_ratio','0.60','0.70')
-body HTML_BADTAG_70_80	eval:html_range('bad_tag_ratio','0.70','0.80')
-body HTML_BADTAG_80_90	eval:html_range('bad_tag_ratio','0.80','0.90')
 body HTML_BADTAG_90_100	eval:html_range('bad_tag_ratio','0.90','1.00')
-describe HTML_BADTAG_00_10	HTML message is 0% to 10% bad tags
-describe HTML_BADTAG_10_20	HTML message is 10% to 20% bad tags
-describe HTML_BADTAG_20_30	HTML message is 20% to 30% bad tags
-describe HTML_BADTAG_30_40	HTML message is 30% to 40% bad tags
 describe HTML_BADTAG_40_50	HTML message is 40% to 50% bad tags
 describe HTML_BADTAG_50_60	HTML message is 50% to 60% bad tags
 describe HTML_BADTAG_60_70	HTML message is 60% to 70% bad tags
-describe HTML_BADTAG_70_80	HTML message is 70% to 80% bad tags
-describe HTML_BADTAG_80_90	HTML message is 80% to 90% bad tags
 describe HTML_BADTAG_90_100	HTML message is 90% to 100% bad tags
 
 # percentage of unique non-elements in HTML
-body HTML_NONELEMENT_00_10	eval:html_range('non_element_ratio','0.00','0.10')
-body HTML_NONELEMENT_10_20	eval:html_range('non_element_ratio','0.10','0.20')
-body HTML_NONELEMENT_20_30	eval:html_range('non_element_ratio','0.20','0.30')
 body HTML_NONELEMENT_30_40	eval:html_range('non_element_ratio','0.30','0.40')
 body HTML_NONELEMENT_40_50	eval:html_range('non_element_ratio','0.40','0.50')
-body HTML_NONELEMENT_50_60	eval:html_range('non_element_ratio','0.50','0.60')
 body HTML_NONELEMENT_60_70	eval:html_range('non_element_ratio','0.60','0.70')
-body HTML_NONELEMENT_70_80	eval:html_range('non_element_ratio','0.70','0.80')
 body HTML_NONELEMENT_80_90	eval:html_range('non_element_ratio','0.80','0.90')
-body HTML_NONELEMENT_90_100	eval:html_range('non_element_ratio','0.90','1.00')
-describe HTML_NONELEMENT_00_10	0% to 10% of HTML elements are non-standard
-describe HTML_NONELEMENT_10_20	10% to 20% of HTML elements are non-standard
-describe HTML_NONELEMENT_20_30	20% to 30% of HTML elements are non-standard
 describe HTML_NONELEMENT_30_40	30% to 40% of HTML elements are non-standard
 describe HTML_NONELEMENT_40_50	40% to 50% of HTML elements are non-standard
-describe HTML_NONELEMENT_50_60	50% to 60% of HTML elements are non-standard
 describe HTML_NONELEMENT_60_70	60% to 70% of HTML elements are non-standard
-describe HTML_NONELEMENT_70_80	70% to 80% of HTML elements are non-standard
 describe HTML_NONELEMENT_80_90	80% to 90% of HTML elements are non-standard
-describe HTML_NONELEMENT_90_100	90% to 100% of HTML elements are non-standard
 
 # short HTML messages with certain attributes
 body __HTML_LINK_IMAGE		eval:html_text_match('anchor', '<img>')
@@ -329,32 +195,21 @@
 body __HTML_LENGTH_1024_1536	eval:html_range('length', '1024', '1536')
 body __HTML_LENGTH_1536_2048	eval:html_range('length', '1536', '2048')
 
-body HTML_SHORT_LENGTH		eval:html_eval('length', '< 170')
-describe HTML_SHORT_LENGTH	HTML is extremely short
-
 body __HTML_LENGTH_512		eval:html_eval('length', '< 512')
 body __COMMENT_EXISTS		eval:html_text_match('comment', '<!.*?>')
 
 body __HTML_LENGTH_384		eval:html_eval('length', '< 384')
 body __TAG_EXISTS_CENTER	eval:html_tag_exists('center')
 
-body HTML_TITLE_EMPTY		eval:html_text_not_match('title', '(?s)\S')
-describe HTML_TITLE_EMPTY	HTML title contains no text
-
 body __HTML_TITLE_120		eval:html_text_match('title', '.{120}')
 
 body __HTML_TITLE_SUBJ_DIFF	eval:html_title_subject_ratio('3.5')
 
-body HTML_TITLE_UNTITLED	eval:html_text_match('title', '(?i)(?:untitled|new page \d+)')
-describe HTML_TITLE_UNTITLED	HTML title contains "Untitled"
 
 body __HTML_CHARSET_FARAWAY	eval:html_charset_faraway()
 
 body HTML_IFRAME_SRC	eval:check_iframe_src()
 describe HTML_IFRAME_SRC	Message has HTML IFRAME tag with SRC URI
-
-body URI_HTML_ONLY	eval:check_html_uri_only()
-describe URI_HTML_ONLY	URIs only found in HTML part of multipart/alternative message
 
 endif
 

Modified: spamassassin/branches/jm_re2c_hacks/rules/25_replace.cf
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/25_replace.cf?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/25_replace.cf (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/25_replace.cf Wed Oct 25 09:15:31 2006
@@ -104,9 +104,6 @@
 describe FUZZY_BILLION	Attempt to obfuscate words in spam
 replace_rules FUZZY_BILLION
 
-body FUZZY_CELEBREX	/<inter W1><post P2>(?!celebrex)<C><E><L><E><B><R><E><X>/i
-describe FUZZY_CELEBREX	Attempt to obfuscate words in spam
-replace_rules FUZZY_CELEBREX
 
 body FUZZY_CPILL	/(?!ciali[sz])<C><I><A><L><I><S>/i
 describe FUZZY_CPILL	Attempt to obfuscate words in spam
@@ -120,9 +117,6 @@
 describe FUZZY_ERECT	Attempt to obfuscate words in spam
 replace_rules FUZZY_ERECT
 
-body FUZZY_FOLLOW	/(?!follow)<F><O><L><L><O><W>/i
-describe FUZZY_FOLLOW	Attempt to obfuscate words in spam
-replace_rules FUZZY_FOLLOW
 
 body FUZZY_GUARANTEE	/<inter W1><post P2>(?!guarantee)<G><U><A><R><A><N><T><E><E>/i
 describe FUZZY_GUARANTEE	Attempt to obfuscate words in spam
@@ -132,9 +126,6 @@
 describe FUZZY_MEDICATION	Attempt to obfuscate words in spam
 replace_rules FUZZY_MEDICATION
 
-body FUZZY_MILF		/<inter SP>(?!milf)\b<M><I><L><F>/i
-describe FUZZY_MILF	Attempt to obfuscate words in spam
-replace_rules FUZZY_MILF
 
 body FUZZY_MILLION	/(?!million)<M><I><L><L><I><O><N>/i
 describe FUZZY_MILLION	Attempt to obfuscate words in spam
@@ -164,9 +155,6 @@
 describe FUZZY_PHENT	Attempt to obfuscate words in spam
 replace_rules FUZZY_PHENT
 
-body FUZZY_PLEASE	/(?!please)<P><L><E><A><S><E>/i
-describe FUZZY_PLEASE	Attempt to obfuscate words in spam
-replace_rules FUZZY_PLEASE
 
 body FUZZY_PRESCRIPT	/<inter W2><post P2>(?!prescription)<P><R><E><S><C><R><I><P><T><I><O><N>/i
 describe FUZZY_PRESCRIPT	Attempt to obfuscate words in spam
@@ -197,17 +185,11 @@
 describe FUZZY_THOUSANDS	Attempt to obfuscate words in spam
 replace_rules FUZZY_THOUSANDS
 
-body FUZZY_TRAMADOL	/<inter W1><post P2>(?!tramadol)<T><R><A><M><A><D><O><L>/i
-describe FUZZY_TRAMADOL	Attempt to obfuscate words in spam
-replace_rules FUZZY_TRAMADOL
 
 body FUZZY_VLIUM	/<inter W1><post P2>(?!valium)<V><A><L><I><U><M>/i
 describe FUZZY_VLIUM	Attempt to obfuscate words in spam
 replace_rules FUZZY_VLIUM
 
-body FUZZY_VICODIN	/<inter W1><post P2>(?!vicodin)<V><I><C><O><D><I><N>/i
-describe FUZZY_VICODIN	Attempt to obfuscate words in spam
-replace_rules FUZZY_VICODIN
 
 body FUZZY_VIOXX	/<inter W1><post P2>(?!vioxx)<V><I><O><X><X>/i
 describe FUZZY_VIOXX	Attempt to obfuscate words in spam

Modified: spamassassin/branches/jm_re2c_hacks/rules/active.list
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/active.list?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/active.list (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/active.list Wed Oct 25 09:15:31 2006
@@ -1,11 +1,5 @@
 # active ruleset list, automatically generated from http://ruleqa.spamassassin.org/
-# with results from: bb-doc bb-jm bb-zmi cthielen daf dos parkerm theo zmi
-
-# good enough
-ADVANCE_FEE_3
-
-# good enough
-ADVANCE_FEE_4
+# with results from: cthielen daf parkerm zmi
 
 # tflags userconf
 ALL_TRUSTED
@@ -17,7 +11,7 @@
 AXB_FAKETZ
 
 # good enough
-BANG_OPRAH
+AXB_XR_STULDAP
 
 # good enough
 BASE64_LENGTH_78
@@ -62,37 +56,7 @@
 BROKEN_RATWARE_BOM
 
 # good enough
-CONFIRMED_FORGED
-
-# good enough
-CUM_SHOT
-
-# good enough
-DATE_IN_FUTURE_03_06
-
-# good enough
-DATE_IN_FUTURE_06_12
-
-# good enough
-DATE_IN_FUTURE_12_24
-
-# good enough
-DATE_IN_FUTURE_24_48
-
-# good enough
-DATE_IN_FUTURE_48_96
-
-# good enough
-DATE_IN_FUTURE_96_XX
-
-# good enough
-DATE_IN_PAST_06_12
-
-# good enough
-DATE_IN_PAST_96_XX
-
-# good enough
-DATE_SPAMWARE_Y2K
+CTYPE_1SPACE_GIF
 
 # good enough
 DC_GIF_MULTI_LARGO
@@ -121,9 +85,6 @@
 # tflags net
 DIGEST_MULTIPLE
 
-# good enough
-DISGUISE_PORN_MUNDANE
-
 # tflags net
 DNS_FROM_AHBL_RHSBL
 
@@ -146,9 +107,6 @@
 DNS_FROM_SECURITYSAGE
 
 # good enough
-DOS_DOUBLE_SOTCK
-
-# good enough
 DOS_LET_GO_JOB
 
 # good enough
@@ -164,81 +122,12 @@
 DOS_YOUR_PLACE
 
 # good enough
-DRUGS_ANXIETY
-
-# good enough
-DRUGS_ANXIETY_EREC
-
-# good enough
-DRUGS_ANXIETY_OBFU
-
-# good enough
-DRUGS_DIET
-
-# good enough
-DRUGS_DIET_OBFU
-
-# good enough
-DRUGS_ERECTILE
-
-# good enough
-DRUGS_ERECTILE_OBFU
-
-# good enough
 DRUGS_HDIA
 
-# good enough
-DRUGS_MANYKINDS
-
-# good enough
-DRUGS_SLEEP_EREC
-
-# good enough
-DRUG_DOSAGE
-
-# good enough
-DRUG_ED_GENERIC
-
-# good enough
-DRUG_ED_ONLINE
-
-# good enough
-DRUG_ED_SILD
-
-# good enough
-EMPTY_MESSAGE
-
-# good enough
-EM_ROLEX
-
 # tflags userconf
 ENV_AND_HDR_SPF_MATCH
 
 # good enough
-EXCUSE_24
-
-# good enough
-EXCUSE_4
-
-# good enough
-EXTRA_MPART_TYPE
-
-# good enough
-FAKE_HELO_LYCOS
-
-# good enough
-FAKE_HELO_MAIL_COM
-
-# good enough
-FAKE_HELO_MAIL_COM_DOM
-
-# good enough
-FAKE_HELO_MSN
-
-# good enough
-FAKE_OUTBLAZE_RCVD
-
-# good enough
 FB_CIALIS_LEO3
 
 # good enough
@@ -254,13 +143,13 @@
 FB_VALIUM_LEO2
 
 # good enough
-FB_VIAGRA_LEO3
+FB_VIAGRA_LEO2
 
 # good enough
-FB_YOURSELF_MASTER
+FB_VIAGRA_LEO3
 
 # good enough
-FH_DATE_IS_19XX
+FB_YOURSELF_MASTER
 
 # good enough
 FH_DATE_PAST_20XX
@@ -278,115 +167,22 @@
 FM_CUSTOMLOGODSGNc
 
 # good enough
-FORGED_AOL_TAGS
-
-# good enough
-FORGED_HOTMAIL_RCVD
-
-# good enough
-FORGED_IMS_HTML
-
-# good enough
-FORGED_IMS_TAGS
-
-# good enough
-FORGED_MSGID_AOL
-
-# good enough
-FORGED_MSGID_EXCITE
-
-# good enough
-FORGED_MSGID_HOTMAIL
-
-# good enough
-FORGED_MSGID_MSN
-
-# good enough
-FORGED_MSGID_YAHOO
+FS_START_DOYOU
 
 # good enough
-FORGED_MUA_AOL_FROM
-
-# good enough
-FORGED_MUA_EUDORA
-
-# good enough
-FORGED_MUA_IMS
-
-# good enough
-FORGED_MUA_MOZILLA
-
-# good enough
-FORGED_MUA_OIMO
-
-# good enough
-FORGED_MUA_OUTLOOK
-
-# good enough
-FORGED_MUA_THEBAT_BOUN
-
-# good enough
-FORGED_OUTLOOK_HTML
-
-# good enough
-FORGED_OUTLOOK_TAGS
-
-# good enough
-FORGED_QUALCOMM_TAGS
-
-# good enough
-FORGED_TELESP_RCVD
-
-# good enough
-FORGED_THEBAT_HTML
-
-# good enough
-FORGED_YAHOO_RCVD
-
-# good enough
-FREE_PORN
-
-# good enough
-FREE_QUOTE_INSTANT
-
-# good enough
-FROM_ALL_NUMS
-
-# good enough
-FROM_BLANK_NAME
-
-# good enough
-FROM_DOMAIN_NOVOWEL
-
-# good enough
-FROM_ENDS_IN_NUMS
-
-# good enough
-FROM_HAS_MIXED_NUMS
-
-# good enough
-FROM_ILLEGAL_CHARS
-
-# good enough
-FROM_LOCAL_DIGITS
-
-# good enough
-FROM_LOCAL_HEX
-
-# good enough
-FROM_LOCAL_NOVOWEL
+FS_START_DOYOU2
 
 # good enough
-FROM_NO_USER
+FUZZY_MERIDIA
 
 # good enough
-FS_START_DOYOU2
+FUZZY_SPRM
 
 # good enough
-FUZZY_MERIDIA
+FUZZY_STOCK
 
 # good enough
-GAPPY_SUBJECT
+FU_HOODIA
 
 # good enough
 GEO_QUERY_STRING
@@ -427,88 +223,40 @@
 # tflags userconf
 HASHCASH_HIGH
 
-# good enough
-HEADER_SPAM
-
-# good enough
-HEAD_ILLEGAL_CHARS
-
-# good enough
-HELO_DYNAMIC_CHELLO_NL
-
-# good enough
-HELO_DYNAMIC_DIALIN
-
-# good enough
-HELO_DYNAMIC_HCC
-
-# good enough
-HELO_DYNAMIC_HEXIP
-
-# good enough
-HELO_DYNAMIC_HOME_NL
+# tflags userconf
+HEAD_LONG
 
 # good enough
-HELO_DYNAMIC_IPADDR
+HS_EXTRA
 
 # good enough
-HELO_DYNAMIC_IPADDR2
+HS_GETMEOFF
 
 # good enough
-HELO_DYNAMIC_SPLIT_IP
+HS_INDEX_PARAM
 
 # good enough
-HG_HORMONE
+HS_MEETUP_FOR_SEX
 
 # good enough
-HS_FORGED_OE_FW
+HS_NO_FLOWERS
 
 # good enough
-HS_GETMEOFF
+HS_PHARMA_1
 
 # good enough
-HS_MEETUP_FOR_SEX
+HS_SUBJ_ONLINE_PHARMACEUTICAL
 
 # good enough
-HS_SUBJ_ONLINE_PHARMACEUTICAL
+HS_SYNDICATE_P2
 
 # tflags userconf
 HTML_CHARSET_FARAWAY
 
 # good enough
-HTTPS_HTTP_MISMATCH
-
-# good enough
-HTTPS_IP_MISMATCH
-
-# good enough
-HTTP_77
-
-# good enough
-HTTP_EXCESSIVE_ESCAPES
-
-# good enough
-INFO_TLD
-
-# good enough
-INVALID_DATE
-
-# good enough
-INVALID_TZ_CST
-
-# good enough
-INVESTMENT_ADVICE
-
-# good enough
-JM_LC_MID
-
-# good enough
 JM_RCVD_QMAILV1
 
 # good enough
-JM_RCVD_SENDMAILID
-
-# good enough
 KAM_STOCKOTC
 
 # good enough
@@ -518,25 +266,19 @@
 KAM_STOCKTIP15
 
 # good enough
-KAM_STOCKTIP21
-
-# good enough
-KAM_STOCKTIP6
-
-# good enough
-KAM_STOCKTIP8
+KAM_STOCKTIP2
 
 # good enough
-KOREAN_UCE_SUBJECT
+KAM_STOCKTIP21
 
 # good enough
-LOCALPART_IN_SUBJECT
+KAM_STOCKTIP3
 
 # good enough
-LONGWORDS
+KAM_STOCKTIP6
 
 # good enough
-MALE_ENHANCE
+KAM_STOCKTIP8
 
 # good enough
 MID_DEGREES
@@ -547,78 +289,12 @@
 # good enough
 MID_OUTLOOK_ZZZNN
 
-# good enough
-MILLION_USD
-
-# good enough
-MIME_BAD_ISO_CHARSET
-
-# good enough
-MIME_BASE64_BLANKS
-
-# good enough
-MIME_BASE64_TEXT
-
-# good enough
-MIME_BOUND_ALLHEX_17
-
-# good enough
-MIME_BOUND_DD_DIGITS
-
-# good enough
-MIME_BOUND_DIGITS_15
-
-# good enough
-MIME_BOUND_MANY_HEX
-
-# good enough
-MISSING_MIMEOLE
-
-# good enough
-MISSING_MIME_HB_SEP
-
-# good enough
-MORE_SEX
-
-# good enough
-MSGID_DOLLARS_RANDOM
-
-# good enough
-MSGID_OUTLOOK_INVALID
-
-# good enough
-MSGID_RANDY
-
-# good enough
-MSGID_SHORT
-
-# good enough
-MSGID_SPAM_CAPS
-
-# good enough
-MSGID_SPAM_LETTERS
-
-# good enough
-MSGID_YAHOO_CAPS
-
-# good enough
-MULTIPART_ALT_NON_TEXT
-
-# good enough
-NOT_ADVISOR
+# tflags userconf
+MISSING_HB_SEP
 
 # tflags net
 NO_DNS_FOR_FROM
 
-# good enough
-NO_PRESCRIPTION
-
-# good enough
-NO_RDNS_DOTCOM_HELO
-
-# tflags userconf
-NO_RECEIVED
-
 # tflags userconf
 NO_RELAYS
 
@@ -626,68 +302,14 @@
 NULL_IN_BODY
 
 # good enough
-NUMERIC_HTTP_ADDR
-
-# good enough
-PERCENT_RANDOM
-
-# good enough
-PLING_QUERY
-
-# good enough
-PORN_15
+PART_CID_STOCK
 
 # good enough
-PREST_NON_ACCREDITED
+PART_CID_STOCK_LESS
 
 # tflags net
 PYZOR_CHECK
 
-# good enough
-RATWARE_EFROM
-
-# good enough
-RATWARE_EGROUPS
-
-# good enough
-RATWARE_GECKO_BUILD
-
-# good enough
-RATWARE_HASH_2
-
-# good enough
-RATWARE_HASH_2_BUG2108
-
-# good enough
-RATWARE_HASH_2_V2
-
-# good enough
-RATWARE_HASH_2_V2_BUG2108
-
-# good enough
-RATWARE_MOZ_MALFORMED
-
-# good enough
-RATWARE_MS_HASH
-
-# good enough
-RATWARE_NAME_ID
-
-# good enough
-RATWARE_OE_MALFORMED
-
-# good enough
-RATWARE_OUTLOOK_NONAME
-
-# good enough
-RATWARE_RCVD_AT
-
-# good enough
-RATWARE_RCVD_PF
-
-# good enough
-RATWARE_ZERO_TZ
-
 # tflags net
 RAZOR2_CF_RANGE_51_100
 
@@ -701,22 +323,7 @@
 RAZOR2_CHECK
 
 # good enough
-RCVD_AM_PM
-
-# good enough
-RCVD_DOUBLE_IP_LOOSE
-
-# good enough
-RCVD_DOUBLE_IP_SPAM
-
-# good enough
-RCVD_FAKE_HELO_DOTCOM
-
-# good enough
-RCVD_HELO_IP_MISMATCH
-
-# good enough
-RCVD_ILLEGAL_IP
+RCVD_FORGED_WROTE
 
 # tflags net
 RCVD_IN_BL_SPAMCOP_NET
@@ -803,41 +410,11 @@
 RCVD_IN_XBL
 
 # good enough
-RCVD_LSO_SND
-
-# good enough
-RCVD_NUMERIC_HELO
-
-# good enough
-REPLICA_WATCH
-
-# good enough
-REPTO_OVERQUOTE_THEBAT
-
-# good enough
-REPTO_QUOTE_AOL
-
-# good enough
-REPTO_QUOTE_IMS
-
-# good enough
-REPTO_QUOTE_MSN
-
-# good enough
-REPTO_QUOTE_QUALCOMM
-
-# good enough
-REPTO_QUOTE_YAHOO
+RCVD_MAIL_COM
 
 # tflags net
 ROUND_THE_WORLD
 
-# good enough
-ROUND_THE_WORLD_LOCAL
-
-# good enough
-SORTED_RECIPS
-
 # tflags net
 SPF_FAIL
 
@@ -862,30 +439,6 @@
 # tflags net
 SPF_SOFTFAIL
 
-# good enough
-SPOOF_COM2OTH
-
-# good enough
-SPOOF_NET2COM
-
-# good enough
-STOCK_ALERT
-
-# good enough
-SUBJECT_DRUG_GAP_C
-
-# good enough
-SUBJECT_DRUG_GAP_L
-
-# good enough
-SUBJECT_DRUG_GAP_S
-
-# good enough
-SUBJECT_DRUG_GAP_VA
-
-# good enough
-SUBJECT_DRUG_GAP_X
-
 # tflags userconf
 SUBJECT_IN_BLACKLIST
 
@@ -896,25 +449,13 @@
 SUBJECT_NEEDS_ENCODING
 
 # good enough
-SUBJECT_SEXUAL
-
-# good enough
-SUBJ_ILLEGAL_CHARS
-
-# good enough
 SUBJ_RE_NUM
 
 # good enough
-SUSPICIOUS_RECIPS
-
-# good enough
-TO_MALFORMED
-
-# good enough
-TO_NO_USER
+TT_MSGID_TRUNC
 
 # good enough
-TT_MSGID_TRUNC
+TT_OBSCURED_VIAGRA
 
 # good enough
 TVD_ACT_193
@@ -923,21 +464,27 @@
 TVD_APP_LOAN
 
 # good enough
-TVD_DEAR_HOMEOWNER
+TVD_BODY_END_STAR
 
 # good enough
-TVD_DOLLARS_US
+TVD_DEAR_HOMEOWNER
 
 # good enough
 TVD_EB_PHISH
 
 # good enough
-TVD_FINGER_02
+TVD_ENHANCE
+
+# good enough
+TVD_FINGER_01
 
 # good enough
 TVD_FLOAT_GENERAL
 
 # good enough
+TVD_FROM_1
+
+# good enough
 TVD_FUZZY_DEGREE
 
 # good enough
@@ -953,18 +500,24 @@
 TVD_FUZZY_PHARMACEUTICAL
 
 # good enough
+TVD_FUZZY_SECURITIES
+
+# good enough
 TVD_FUZZY_SYMBOL
 
 # good enough
-TVD_FW_GRAPHIC_ID3
+TVD_FW_GRAPHIC_ID1
 
 # good enough
-TVD_FW_GRAPHIC_ID3_2
+TVD_FW_GRAPHIC_ID2
 
 # good enough
 TVD_FW_MESG1
 
 # good enough
+TVD_FW_MESG2
+
+# good enough
 TVD_GET_STOCK
 
 # good enough
@@ -974,13 +527,10 @@
 TVD_INCREASE_SIZE
 
 # good enough
-TVD_LINK_SAVE
-
-# good enough
 TVD_NOT_SATISFIED
 
 # good enough
-TVD_PH_7
+TVD_PH_BODY_META
 
 # good enough
 TVD_PH_FR5
@@ -989,33 +539,21 @@
 TVD_PH_REC
 
 # good enough
-TVD_PH_SUBJ_ACCOUNTS_POST
-
-# good enough
-TVD_PH_SUBJ_META
-
-# good enough
-TVD_PH_SUBJ_META_ALL
-
-# good enough
-TVD_PH_SUBJ_SEC_MEASURES
-
-# good enough
 TVD_PH_SUBJ_UPDATE
 
 # good enough
 TVD_PH_SUBJ_URGENT
 
 # good enough
-TVD_PP_PHISH
-
-# good enough
 TVD_QUAL_MEDS
 
 # good enough
 TVD_RATWARE_CB
 
 # good enough
+TVD_RATWARE_CB_2
+
+# good enough
 TVD_RATWARE_MSGID_01
 
 # good enough
@@ -1037,9 +575,6 @@
 TVD_SPACED_WORDS
 
 # good enough
-TVD_SPACE_RATIO
-
-# good enough
 TVD_STOCK1
 
 # good enough
@@ -1055,9 +590,6 @@
 TVD_SUBJ_FINGER_03
 
 # good enough
-TVD_SUBJ_FINGER_04
-
-# good enough
 TVD_SUBJ_OWE
 
 # good enough
@@ -1067,50 +599,41 @@
 TVD_UA_FOSTERING
 
 # good enough
-TVD_VIS_HIDDEN
+TVD_UNDER_VALUED
 
 # good enough
-FORGED_IMS_HTML
+TVD_VIS_HIDDEN
 
 # good enough
-FORGED_IMS_TAGS
+DOS_TO_READ_STOCK
 
 # good enough
-FORGED_OUTLOOK_HTML
+DRUGS_STOCK_MIMEOLE
 
 # good enough
-FORGED_OUTLOOK_TAGS
+DRUGS_STOCK_MIMEOLE2
 
 # good enough
-RATWARE_MS_HASH
+FB_CIALIS_LEO2
 
 # good enough
-RATWARE_OUTLOOK_NONAME
+FH_DATE_IS_19XX
 
 # good enough
-TVD_FW_GRAPHIC_ID1
+FR_WWW_DOMAIN_23SUBDIR
 
 # good enough
-TVD_FW_GRAPHIC_ID2
+KAM_STOCKTIP20
 
 # good enough
-UNCLAIMED_MONEY
+TVD_LINK_SAVE
 
 # good enough
-UNCLOSED_BRACKET
+TVD_PH_SUBJ_SEC_MEASURES
 
 # tflags userconf
 UNPARSEABLE_RELAY
 
-# good enough
-UNRESOLVED_TEMPLATE
-
-# good enough
-UPPERCASE_50_75
-
-# good enough
-UPPERCASE_75_100
-
 # tflags net
 URIBL_AB_SURBL
 
@@ -1144,15 +667,6 @@
 # good enough
 URI_L_PHP
 
-# good enough
-URI_NOVOWEL
-
-# good enough
-URI_NO_WWW_BIZ_CGI
-
-# good enough
-URI_NO_WWW_INFO_CGI
-
 # tflags userconf
 USER_IN_ALL_SPAM_TO
 
@@ -1184,34 +698,31 @@
 VERTICAL_DRUGS_1
 
 # good enough
-VIA_GAP_GRA
+VERTICAL_WORDS_1
 
 # good enough
-WEIRD_QUOTING
-
-# good enough
-X_IP
+ZMIde_EBAYJOBSURI
 
 # good enough
-X_LIBRARY
+ZMIde_GIRLSRCH1
 
 # good enough
-X_MESSAGE_INFO
+ZMIde_GIRLSRCH2
 
 # good enough
-X_MSMAIL_PRIORITY_HIGH
+ZMIde_LOVEGALX1
 
 # good enough
-X_ORIG_IP_NOT_IPV4
+ZMIde_LOVEGALX2
 
 # good enough
-X_PRIORITY_CC
+ZMIde_LOVEGALXURI
 
 # good enough
-ZMIde_EBAYJOBSURI
+ZMIde_SEXUALEXPL1
 
 # good enough
-ZMIde_SEXUALEXPL1
+ZMIde_URIPORNWEB
 
 # tflags net
 __RCVD_IN_IADB

Modified: spamassassin/branches/jm_re2c_hacks/rules/regression_tests.cf
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/regression_tests.cf?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/regression_tests.cf (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/regression_tests.cf Wed Oct 25 09:15:31 2006
@@ -27,8 +27,6 @@
 test DEAR_FRIEND ok Dear friend,
 test DEAR_FRIEND fail Dear Mr. Ithacus,
 
-test FROM_ENDS_IN_NUMS     ok   matt12345678@sergeant.org
-test FROM_ENDS_IN_NUMS     fail matt@sergeant.org
 test FROM_STARTS_WITH_NUMS     ok   12345678matt@sergeant.org
 test FROM_STARTS_WITH_NUMS     fail matt@sergeant.org
 test FORGED_YAHOO_RCVD fail by mf1.lng.yahoo.com (8.11.1/8.11.1) id g3SDfPH19426

Modified: spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre (original)
+++ spamassassin/branches/jm_re2c_hacks/rules/rule2xs.pre Wed Oct 25 09:15:31 2006
@@ -1,4 +1,4 @@
 
-# loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
+loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
 loadplugin Mail::SpamAssassin::Plugin::Rule2XSBody
 

Modified: spamassassin/branches/jm_re2c_hacks/sa-learn.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/sa-learn.raw?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/sa-learn.raw (original)
+++ spamassassin/branches/jm_re2c_hacks/sa-learn.raw Wed Oct 25 09:15:31 2006
@@ -85,7 +85,8 @@
 %opt = (
   'force-expire' => 0,
   'use-ignores'  => 0,
-  'nosync'    => 0,
+  'nosync'       => 0,
+  'cf'           => []
 );
 
 Getopt::Long::Configure(
@@ -104,6 +105,7 @@
   'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
   'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
   'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
+  'cf=s'                                    => \@{$opt{'cf'}},
 
   'folders|f=s'          => \$opt{'folders'},
   'force-expire|expire'  => \$opt{'force-expire'},
@@ -216,6 +218,8 @@
   $post_config .= "use_bayes 1\n";
 }
 
+$post_config .= join("\n", @{$opt{'cf'}})."\n";
+
 # create the tester factory
 $spamtest = new Mail::SpamAssassin(
   {
@@ -585,6 +589,7 @@
  -C path, --configpath=path, --config-file=path   Path to standard configuration dir
  -p prefs, --prefspath=file, --prefs-file=file    Set user preferences file
  --siteconfigpath=path             Path for site configs (def: /etc/mail/spamassassin)
+ --cf='config line'                Additional line of configuration
  -D, --debug-level                 Print debugging messages
  -V, --version                     Print version
  -h, --help                        Print usage message
@@ -738,6 +743,12 @@
 
 Use the specified path for locating site-specific configuration files.  Ignore
 the default directories (usually C</etc/mail/spamassassin> or similar).
+
+=item B<--cf='config line'>
+
+Specify additional lines of configuration directly on the command line, parsed
+after the configuration files are read.  Multiple B<--cf> arguments can be
+used, and each will be considered a separate line of configuration.
 
 =item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
 

Modified: spamassassin/branches/jm_re2c_hacks/sa-update.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/sa-update.raw?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/sa-update.raw (original)
+++ spamassassin/branches/jm_re2c_hacks/sa-update.raw Wed Oct 25 09:15:31 2006
@@ -362,19 +362,6 @@
   }
 }
 
-# --lint check the current site config before we download any updates so that
-# a site with a broken config, with sa-update in a cron job, doesn't hammer
-# the update servers continously downloading and then aborting the update when
-# the lint check of the update (with the site config included) fails.
-# Wait until now to do it since nothing above depends on a successful --lint.
-
-if (!lint_check_dir(File::Spec->catfile($opt{'updatedir'}, "doesnotexist"))) {
-  warn "error: lint check of current site config failed, cannot continue\n";
-  dbg("diag: local site config must successfully lint before doing updates, ".
-	"exiting with code 2");
-  exit 2;
-}
-
 my $res = Net::DNS::Resolver->new();
 
 my $ua = LWP::UserAgent->new();
@@ -1252,6 +1239,7 @@
   # "config" or otherwise be more terse. :(
   my $spamtest = new Mail::SpamAssassin( {
     rules_filename      => $dir,
+    site_rules_filename => File::Spec->catfile($dir, "doesnotexist"),
     userprefs_filename  => File::Spec->catfile($dir, "doesnotexist"),
 
     local_tests_only    => 1,
@@ -1438,8 +1426,6 @@
 installed successfully.
 
 An exit code of C<1> means no fresh updates were available.
-
-An exit code of C<2> means that a lint check of the current site config failed.
 
 An exit code of C<4> or higher, indicates that errors occurred while
 attempting to download and extract updates.

Modified: spamassassin/branches/jm_re2c_hacks/spamassassin.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/spamassassin.raw?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/spamassassin.raw (original)
+++ spamassassin/branches/jm_re2c_hacks/spamassassin.raw Wed Oct 25 09:15:31 2006
@@ -152,7 +152,7 @@
 # - create user preference files
 # - have ArchiveIterator detect the input message format (file vs dir)
 #
-my %opt = ( 'create-prefs' => 1, 'format' => 'detect' );
+my %opt = ( 'create-prefs' => 1, 'format' => 'detect', cf => [] );
 
 my $doing_whitelist_operation = 0;
 my $count                     = 0;
@@ -174,6 +174,7 @@
   'add-to-whitelist|W'                      => \$opt{'add-to-whitelist'},
   'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
   'create-prefs!'                           => \$opt{'create-prefs'},
+  'cf=s'                                    => \@{$opt{'cf'}},
   'debug|D:s'                               => \$opt{'debug'},
   'error-code|exit-code|e:i'                => \$opt{'error-code'},
   'help|h|?'                                => \$opt{'help'},
@@ -250,6 +251,7 @@
     local_tests_only    => $opt{'local'},
     debug               => $opt{'debug'},
     dont_copy_prefs     => ( $opt{'create-prefs'} ? 0 : 1 ),
+    post_config_text    => join("\n", @{$opt{'cf'}})."\n",
     PREFIX              => $PREFIX,
     DEF_RULES_DIR       => $DEF_RULES_DIR,
     LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,

Modified: spamassassin/branches/jm_re2c_hacks/spamc/configure
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/spamc/configure?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/spamc/configure (original)
+++ spamassassin/branches/jm_re2c_hacks/spamc/configure Wed Oct 25 09:15:31 2006
@@ -2231,9 +2231,10 @@
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
-if test "x$GCC" = "xyes" ; then
-  CFLAGS="-Wall -Wextra -Wdeclaration-after-statement $CFLAGS"
-fi
+# disabled: these extra warning flags break gcc 3.3.x
+# if test "x$GCC" = "xyes" ; then
+# CFLAGS="-Wall -Wextra -Wdeclaration-after-statement $CFLAGS"
+# fi
 
 
 

Modified: spamassassin/branches/jm_re2c_hacks/spamc/configure.in
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/spamc/configure.in?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/spamc/configure.in (original)
+++ spamassassin/branches/jm_re2c_hacks/spamc/configure.in Wed Oct 25 09:15:31 2006
@@ -17,9 +17,10 @@
 
 AC_PROG_CC
 
-if test "x$GCC" = "xyes" ; then
-  CFLAGS="-Wall -Wextra -Wdeclaration-after-statement $CFLAGS"
-fi
+# disabled: these extra warning flags break gcc 3.3.x
+# if test "x$GCC" = "xyes" ; then
+# CFLAGS="-Wall -Wextra -Wdeclaration-after-statement $CFLAGS"
+# fi
 
 AC_EXEEXT
 

Modified: spamassassin/branches/jm_re2c_hacks/spamd/spamd.raw
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/spamd/spamd.raw?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/spamd/spamd.raw (original)
+++ spamassassin/branches/jm_re2c_hacks/spamd/spamd.raw Wed Oct 25 09:15:31 2006
@@ -170,6 +170,7 @@
   'min-children'  => 1,         # min kids to have running
   'min-spare'     => 1,         # min kids that must be spare
   'max-spare'     => 2,         # max kids that should be spare
+  'cf'            => [],        # extra config lines
 );
 
 
@@ -236,6 +237,7 @@
   'setuid-with-ldap'         => \$opt{'setuid-with-ldap'},
   'setuid-with-sql'          => \$opt{'setuid-with-sql'},
   'siteconfigpath=s'         => \$opt{'siteconfigpath'},
+  'cf=s'                     => \@{$opt{'cf'}},
   'socketgroup=s'            => \$opt{'socketgroup'},
   'socketmode=s'             => \$opt{'socketmode'},
   'socketowner=s'            => \$opt{'socketowner'},
@@ -731,6 +733,7 @@
     dont_copy_prefs      => $dontcopy,
     rules_filename       => ( $opt{'configpath'} || 0 ),
     site_rules_filename  => ( $opt{'siteconfigpath'} || 0 ),
+    post_config_text     => join("\n", @{$opt{'cf'}})."\n",
     force_ipv4           => ( $opt{'force_ipv4'} || 0 ),
     local_tests_only     => ( $opt{'local'} || 0 ),
     debug                => ( $opt{'debug'} || 0 ),
@@ -2216,6 +2219,7 @@
  -c, --create-prefs                 Create user preferences files
  -C path, --configpath=path         Path for default config files
  --siteconfigpath=path              Path for site configs
+ --cf='config line'                 Additional line of configuration
  -d, --daemonize                    Daemonize
  -h, --help                         Print usage message.
  -i [ipaddr], --listen-ip=ipaddr    Listen on the IP ipaddr
@@ -2319,6 +2323,12 @@
 
 Use the specified path for locating site-specific configuration files.  Ignore
 the default directories (usually C</etc/mail/spamassassin> or similar).
+
+=item B<--cf='config line'>
+
+Add lines of configuration directly from the command line, parsed after the
+configuration files are read.  Multiple B<--cf> arguments can be used, and
+each will be considered a separate line of configuration.
 
 =item B<-d>, B<--daemonize>
 

Modified: spamassassin/branches/jm_re2c_hacks/t/bayesdbm.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/bayesdbm.t?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/bayesdbm.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/bayesdbm.t Wed Oct 25 09:15:31 2006
@@ -63,7 +63,8 @@
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
-ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');
+ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated')
+    or warn "got: [$msgid]";
 ok($msgid_hdr eq '9PS291LhupY');
 
 ok($sa->{bayes_scanner}->{store}->tie_db_writable());

Modified: spamassassin/branches/jm_re2c_hacks/t/mimeheader.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/mimeheader.t?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/mimeheader.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/mimeheader.t Wed Oct 25 09:15:31 2006
@@ -2,7 +2,7 @@
 
 use lib '.'; use lib 't';
 use SATest; sa_t_init("mimeheader");
-use Test; BEGIN { plan tests => 2 };
+use Test; BEGIN { plan tests => 4 };
 
 $ENV{'LANGUAGE'} = $ENV{'LC_ALL'} = 'C';             # a cheat, but we need the patterns to work
 
@@ -12,6 +12,8 @@
 
   q{ MIMEHEADER_TEST1 }, q{ test1 },
   q{ MIMEHEADER_TEST2 }, q{ test2 },
+  q{ MATCH_NL_NONRAW }, q{ match_nl_nonraw },
+  q{ MATCH_NL_RAW }, q{ match_nl_raw },
 
 );
 
@@ -20,6 +22,9 @@
   # loadplugin Mail::SpamAssassin::Plugin::MIMEHeader
   mimeheader MIMEHEADER_TEST1 content-type =~ /application\/msword/
   mimeheader MIMEHEADER_TEST2 content-type =~ m!APPLICATION/MSWORD!i
+
+  mimeheader MATCH_NL_NONRAW       Content-Type =~ /msword; name/
+  mimeheader MATCH_NL_RAW   Content-Type:raw =~ /msword;\n\tname/
 
 	});
 

Modified: spamassassin/branches/jm_re2c_hacks/t/missing_hb_separator.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/missing_hb_separator.t?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/missing_hb_separator.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/missing_hb_separator.t Wed Oct 25 09:15:31 2006
@@ -42,6 +42,7 @@
 
 $result = 0;
 foreach (@{$status->{test_names_hit}}) {
+  print "test hit: $_\n";
   $result++ if ($_ eq 'MISSING_HB_SEP' || $_ eq 'X_MESSAGE_INFO');
 }
 

Modified: spamassassin/branches/jm_re2c_hacks/t/mkrules.t
URL: http://svn.apache.org/viewvc/spamassassin/branches/jm_re2c_hacks/t/mkrules.t?view=diff&rev=467701&r1=467700&r2=467701
==============================================================================
--- spamassassin/branches/jm_re2c_hacks/t/mkrules.t (original)
+++ spamassassin/branches/jm_re2c_hacks/t/mkrules.t Wed Oct 25 09:15:31 2006
@@ -2,7 +2,7 @@
 
 use lib '.'; use lib 't';
 use SATest; sa_t_init("mkrules");
-use Test; BEGIN { plan tests => 96 };
+use Test; BEGIN { plan tests => 101 };
 use File::Path;
 
 # ---------------------------------------------------------------------------
@@ -417,6 +417,36 @@
 # checkfile("$tdir/rules/72_active.cf", \&patterns_run_cb);
 checkfile("$tdir/rules/70_sandbox.cf", \&patterns_run_cb);
 ok (-f "$tdir/rules/plugin.pm");
+ok ok_all_patterns();
+save_tdir();
+
+# ---------------------------------------------------------------------------
+print "meta rule depends on unpromoted subrule in lexically-earlier file\n\n";
+# (see mail from Sidney of Oct 16 2006, rules HS_INDEX_PARAM and HS_PHARMA_1)
+
+%patterns = (
+  "header T_GOOD_SUB"   => rule_line_1,
+  "header T_BAD_SUB"   => rule_line_2,
+  "meta GOOD (T_GOOD_SUB && !T_BAD_SUB)" => meta_found
+);
+%anti_patterns = (
+);
+
+rmtree([ $tdir ]); mkpath ([ "$tdir/rulesrc/sandbox/foo", "$tdir/rules" ]);
+
+write_file("$tdir/MANIFEST", [ "rules/72_active.cf\n" ]);
+write_file("$tdir/MANIFEST.SKIP", [ ]);
+write_file("$tdir/rules/active.list", [ "GOOD\n" ]);
+write_file("$tdir/rulesrc/sandbox/foo/20_aaa.cf", [
+    "meta GOOD (GOOD_SUB && !BAD_SUB)\n",
+]);
+write_file("$tdir/rulesrc/sandbox/foo/20_bbb.cf", [
+    "header GOOD_SUB Foo =~ /good/\n",
+    "header BAD_SUB Foo =~ /bad/\n",
+]);
+
+ok (mkrun ("--src $tdir/rulesrc --out $tdir/rules --manifest $tdir/MANIFEST --manifestskip $tdir/MANIFEST.SKIP --active $tdir/rules/active.list 2>&1", \&patterns_run_cb));
+checkfile("$tdir/rules/72_active.cf", \&patterns_run_cb);
 ok ok_all_patterns();
 save_tdir();