You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by qu...@apache.org on 2004/11/21 23:37:45 UTC

svn commit: r106134 - /spamassassin/trunk/rules/20_body_tests.cf /spamassassin/trunk/rules/70_testing.cf

Author: quinlan
Date: Sun Nov 21 14:37:44 2004
New Revision: 106134

Modified:
   spamassassin/trunk/rules/20_body_tests.cf
   spamassassin/trunk/rules/70_testing.cf
Log:
Lower false-positive (FP) version of LONGWORDS; it also uses 3 subrules instead
of 4, so it is faster. Also try a different version of LONGWORDS (T_LONGWORDS)
that allows an uppercase first letter, to catch use of dictionaries that include
capitalized words.


Modified: spamassassin/trunk/rules/20_body_tests.cf
Url: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/20_body_tests.cf?view=diff&rev=106134&p1=spamassassin/trunk/rules/20_body_tests.cf&r1=106133&p2=spamassassin/trunk/rules/20_body_tests.cf&r2=106134
==============================================================================
--- spamassassin/trunk/rules/20_body_tests.cf	(original)
+++ spamassassin/trunk/rules/20_body_tests.cf	Sun Nov 21 14:37:44 2004
@@ -154,12 +154,9 @@
 body DOMAIN_RATIO	eval:check_domain_ratio('0.022')
 describe DOMAIN_RATIO	Message body mentions many internet domains
 
-# If these are too expensive as a whole, then delete __LONGWORDS_B and
-# __LONGWORDS_C and replace with (__LONGWORDS_D || __LONGWORDS_A) which
-# is very close in quality.
+# this could use more work
 body __LONGWORDS_A	/\b(?:[a-z]{8,}\s+){6}/
-body __LONGWORDS_B	/\b(?:[a-z]{7,}\s+){8}/
-body __LONGWORDS_C	/\b(?:[a-z]{6,}\s+){9}/
-body __LONGWORDS_D	/\b(?:[a-z]{5,}\s+){10}/
-meta LONGWORDS		(__LONGWORDS_A || __LONGWORDS_B || __LONGWORDS_C || __LONGWORDS_D)
+body __LONGWORDS_B	/\b(?:[a-z]{6,}\s+){9}/
+body __LONGWORDS_C	/\b(?:[a-z]{5,}\s+){10}/
+meta LONGWORDS		(__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C > 1)
 describe LONGWORDS	Long string of long words

Modified: spamassassin/trunk/rules/70_testing.cf
Url: http://svn.apache.org/viewcvs/spamassassin/trunk/rules/70_testing.cf?view=diff&rev=106134&p1=spamassassin/trunk/rules/70_testing.cf&r1=106133&p2=spamassassin/trunk/rules/70_testing.cf&r2=106134
==============================================================================
--- spamassassin/trunk/rules/70_testing.cf	(original)
+++ spamassassin/trunk/rules/70_testing.cf	Sun Nov 21 14:37:44 2004
@@ -205,24 +205,11 @@
 body T_HTML_EXTRA_CLOSE_E	eval:html_range('extra_close_ratio', '0.08', '0.16')
 body T_HTML_EXTRA_CLOSE_F	eval:html_range('extra_close_ratio', '0.16', '0.32')
 
-# bug 3529 - LONGWORDS false positives
-# fast version, but probably less accurate
-meta T_LONGWORDS_01	(__LONGWORDS_A || __LONGWORDS_B)
-meta T_LONGWORDS_02	(__LONGWORDS_A || __LONGWORDS_C)
-meta T_LONGWORDS_03	(__LONGWORDS_A || __LONGWORDS_D)
-meta T_LONGWORDS_04	(__LONGWORDS_B || __LONGWORDS_C)
-meta T_LONGWORDS_05	(__LONGWORDS_B || __LONGWORDS_D)
-meta T_LONGWORDS_06	(__LONGWORDS_C || __LONGWORDS_D)
-meta T_LONGWORDS_07	(__LONGWORDS_A || __LONGWORDS_B || __LONGWORDS_C)
-meta T_LONGWORDS_08	(__LONGWORDS_A || __LONGWORDS_B || __LONGWORDS_D)
-meta T_LONGWORDS_09	(__LONGWORDS_A || __LONGWORDS_C || __LONGWORDS_D)
-meta T_LONGWORDS_10	(__LONGWORDS_B || __LONGWORDS_C || __LONGWORDS_D)
-meta T_LONGWORDS_11	(__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C > 1)
-meta T_LONGWORDS_12	(__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_D > 1)
-meta T_LONGWORDS_13	(__LONGWORDS_A + __LONGWORDS_C + __LONGWORDS_D > 1)
-meta T_LONGWORDS_14	(__LONGWORDS_B + __LONGWORDS_C + __LONGWORDS_D > 1)
-meta T_LONGWORDS_15	(__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C + __LONGWORDS_D > 1)
-meta T_LONGWORDS_16	(__LONGWORDS_A + __LONGWORDS_B + __LONGWORDS_C + __LONGWORDS_D > 2)
+# try allowing uppercase for the first letter
+body __T_LONGWORDS_A	/\b(?:[A-Za-z][a-z]{7,}\s+){6}/
+body __T_LONGWORDS_B	/\b(?:[A-Za-z][a-z]{5,}\s+){9}/
+body __T_LONGWORDS_C	/\b(?:[A-Za-z][a-z]{4,}\s+){10}/
+meta T_LONGWORDS	(__T_LONGWORDS_A + __T_LONGWORDS_B + __T_LONGWORDS_C > 1)
 
 ##########################################################################
 # bug 2843