You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2004/08/05 05:06:02 UTC

svn commit: rev 35712 - in spamassassin/trunk: . build lib/Mail lib/Mail/SpamAssassin/Plugin masses rules

Author: jm
Date: Wed Aug  4 20:06:01 2004
New Revision: 35712

Modified:
   spamassassin/trunk/Changes
   spamassassin/trunk/build/README
   spamassassin/trunk/lib/Mail/SpamAssassin.pm
   spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Hashcash.pm
   spamassassin/trunk/masses/parse-rules-for-masses
   spamassassin/trunk/masses/rewrite-cf-with-new-scores
   spamassassin/trunk/masses/score-ranges-from-freqs
   spamassassin/trunk/rules/50_scores.cf
Log:
Preparing to release 3.0.0-pre4

Modified: spamassassin/trunk/Changes
==============================================================================
--- spamassassin/trunk/Changes	(original)
+++ spamassassin/trunk/Changes	Wed Aug  4 20:06:01 2004
@@ -1,4 +1,235 @@
 ------------------------------------------------------------------------
+r35708 | quinlan | 2004-08-05 01:27:22 +0000 (Thu, 05 Aug 2004) | 2 lines
+
+use proper name of license: Apache License, Version 2.0
+
+------------------------------------------------------------------------
+r35707 | quinlan | 2004-08-05 01:25:13 +0000 (Thu, 05 Aug 2004) | 2 lines
+
+add COPYRIGHT note (trying to hit the major top-level documents)
+
+------------------------------------------------------------------------
+r35706 | quinlan | 2004-08-05 01:24:06 +0000 (Thu, 05 Aug 2004) | 2 lines
+
+update URLs in various places
+
+------------------------------------------------------------------------
+r35705 | jm | 2004-08-05 01:14:25 +0000 (Thu, 05 Aug 2004) | 1 line
+
+removed out-of-date copyright notice; now replaced by stuff in LICENSE, CREDITS and NOTICE
+------------------------------------------------------------------------
+r35704 | quinlan | 2004-08-05 00:58:43 +0000 (Thu, 05 Aug 2004) | 2 lines
+
+remove my copyright
+
+------------------------------------------------------------------------
+r35695 | mss | 2004-08-04 19:37:17 +0000 (Wed, 04 Aug 2004) | 2 lines
+
+Some further man page tweaking.
+
+------------------------------------------------------------------------
+r35694 | mss | 2004-08-04 19:14:13 +0000 (Wed, 04 Aug 2004) | 2 lines
+
+bug 3665: reordered the man page chapters so they follow the common order SYNOPSIS->DESCRIPTION->OPTIONS->OTHERS
+
+------------------------------------------------------------------------
+r35685 | parker | 2004-08-04 14:46:40 +0000 (Wed, 04 Aug 2004) | 1 line
+
+Bug 3656: Fix broken --backup
+------------------------------------------------------------------------
+r35673 | sidney | 2004-08-04 05:28:53 +0000 (Wed, 04 Aug 2004) | 1 line
+
+bug 3638: make test errors in Windows and bug 3639: spamc tests skipped under Windows unless started in t directory
+------------------------------------------------------------------------
+r35667 | jm | 2004-08-04 04:27:22 +0000 (Wed, 04 Aug 2004) | 1 line
+
+another doco fix
+------------------------------------------------------------------------
+r35666 | jm | 2004-08-04 04:26:34 +0000 (Wed, 04 Aug 2004) | 1 line
+
+doco fix
+------------------------------------------------------------------------
+r35662 | jm | 2004-08-04 03:28:01 +0000 (Wed, 04 Aug 2004) | 1 line
+
+bug 3627: patch 2195 applied; the new rewrite-cf-with-new-scores will add a score for AWL.  this seems to break the whitelist_addrs.t test, so removed.  omit scores for lang xx locale-specific rules, otherwise 'make test' fails.   sets 'tflags net' rules scores to 0 for scoresets 0 and 2, instead of defaulting them to 1 (which makes no sense).
+------------------------------------------------------------------------
+r35661 | quinlan | 2004-08-04 03:22:38 +0000 (Wed, 04 Aug 2004) | 2 lines
+
+bug 3627: separate mutable rules from immutable rules using division
+
+------------------------------------------------------------------------
+r35660 | hstern | 2004-08-04 03:03:59 +0000 (Wed, 04 Aug 2004) | 5 lines
+
+* validate-model
+  Redirected stderr from fp-fn-statistics to /dev/null to avoid all of the spam
+  from running the validation set against set0/2.
+
+
+------------------------------------------------------------------------
+r35659 | hstern | 2004-08-04 03:00:21 +0000 (Wed, 04 Aug 2004) | 3 lines
+
+Trivial floating point arithmetic fix.
+
+
+------------------------------------------------------------------------
+r35621 | quinlan | 2004-08-03 07:14:39 +0000 (Tue, 03 Aug 2004) | 2 lines
+
+bug 3634: performance improvements
+
+------------------------------------------------------------------------
+r35614 | quinlan | 2004-08-03 02:47:32 +0000 (Tue, 03 Aug 2004) | 2 lines
+
+various performance improvements, long header test
+
+------------------------------------------------------------------------
+r35585 | quinlan | 2004-08-02 09:53:17 +0000 (Mon, 02 Aug 2004) | 2 lines
+
+more documentation
+
+------------------------------------------------------------------------
+r35584 | quinlan | 2004-08-02 09:51:30 +0000 (Mon, 02 Aug 2004) | 2 lines
+
+documentation tweak
+
+------------------------------------------------------------------------
+r35550 | quinlan | 2004-08-01 22:04:35 +0000 (Sun, 01 Aug 2004) | 2 lines
+
+documentation fix
+
+------------------------------------------------------------------------
+r31067 | felicity | 2004-08-01 00:18:24 +0000 (Sun, 01 Aug 2004) | 1 line
+
+bug 3651: if calling compile_now(), the available Bayes DB will remain tied.  a warning will then pop up (sanity_check_untie) before untieing.  so explicitly untie the DB when we're finished.
+------------------------------------------------------------------------
+r31066 | quinlan | 2004-07-31 23:42:11 +0000 (Sat, 31 Jul 2004) | 2 lines
+
+add scantime parameter to logs
+
+------------------------------------------------------------------------
+r31033 | quinlan | 2004-07-31 09:28:10 +0000 (Sat, 31 Jul 2004) | 3 lines
+
+trivial speed-up, doing s/^\s+|\s+$//g; is always much slower than using
+two replacements
+
+------------------------------------------------------------------------
+r30966 | quinlan | 2004-07-30 05:47:29 +0000 (Fri, 30 Jul 2004) | 2 lines
+
+add -i flag to ignore leading data (handy for using tail on a mbox)
+
+------------------------------------------------------------------------
+r30960 | sidney | 2004-07-30 02:37:01 +0000 (Fri, 30 Jul 2004) | 1 line
+
+Spillchucker had insufficient magic
+------------------------------------------------------------------------
+r30959 | sidney | 2004-07-30 02:33:38 +0000 (Fri, 30 Jul 2004) | 1 line
+
+Update build and test instructions for spamc under Windows to reflect changes we made
+------------------------------------------------------------------------
+r30957 | parker | 2004-07-30 02:11:05 +0000 (Fri, 30 Jul 2004) | 1 line
+
+Bug 3640: Clear current_user variable so handle_sql_user will be called when no User: header present
+------------------------------------------------------------------------
+r30954 | felicity | 2004-07-30 00:26:15 +0000 (Fri, 30 Jul 2004) | 1 line
+
+bug 3644: rewrite_header changes parens in the rewrite section to brackets.  we should do that for the From and To fields, to avoid any issues with comment parsing, but leave Subject alone.
+------------------------------------------------------------------------
+r30927 | parker | 2004-07-29 14:02:53 +0000 (Thu, 29 Jul 2004) | 1 line
+
+Bug 3628: Skip test when Storable is not installed
+------------------------------------------------------------------------
+r30889 | quinlan | 2004-07-29 02:27:05 +0000 (Thu, 29 Jul 2004) | 3 lines
+
+trivial speed-up for UNRESOLVED_TEMPLATE (removes * and +, uses {n,m}
+instead with no loss of hits on my spam corpus)
+
+------------------------------------------------------------------------
+r30880 | jm | 2004-07-28 21:43:39 +0000 (Wed, 28 Jul 2004) | 1 line
+
+as requested, configure rebuilt with autoconf 2.59
+------------------------------------------------------------------------
+r30879 | quinlan | 2004-07-28 21:27:43 +0000 (Wed, 28 Jul 2004) | 2 lines
+
+bug 3599: don't add -Wall to CFLAGS unless we believe GCC is being used
+
+------------------------------------------------------------------------
+r30835 | quinlan | 2004-07-28 09:16:13 +0000 (Wed, 28 Jul 2004) | 2 lines
+
+port 587 is open for business
+
+------------------------------------------------------------------------
+r30811 | sidney | 2004-07-28 03:39:22 +0000 (Wed, 28 Jul 2004) | 1 line
+
+bug 3506: fix declaration of size_t var that should have been ssize_t, use int instead of ssize_t, test for timeout before test for newline
+------------------------------------------------------------------------
+r30803 | mss | 2004-07-27 20:56:38 +0000 (Tue, 27 Jul 2004) | 2 lines
+
+Reverted last commit as per Daniel's veto.
+
+------------------------------------------------------------------------
+r30793 | mss | 2004-07-27 18:27:23 +0000 (Tue, 27 Jul 2004) | 2 lines
+
+bug 3599: Removed -Wall from the CFLAGS for now to make it compile with non-GCC compilers. The file configure.in is currently broken and needs some love for 3.1.
+
+------------------------------------------------------------------------
+r30725 | felicity | 2004-07-26 17:38:16 +0000 (Mon, 26 Jul 2004) | 1 line
+
+setting executable property on score-generation scripts which were missing them
+------------------------------------------------------------------------
+r30724 | hstern | 2004-07-26 17:22:40 +0000 (Mon, 26 Jul 2004) | 13 lines
+
+
+* compare-models
+* config.set0
+* config.set1
+* extract-results
+* generate-corpus
+* tenpass/split-log-into-buckets-random
+* validate-model
+
+  Fixing weirdness from previous commit where contents of new files were
+  duplicated.
+
+
+------------------------------------------------------------------------
+r30702 | mss | 2004-07-25 23:40:03 +0000 (Sun, 25 Jul 2004) | 4 lines
+
+Another rather trivial change to add some debugging output before sockets are established. Should help a lot to track down stuff like <http://bugs.gentoo.org/show_bug.cgi?id=58122>.
+
+The whole code around there is currently a real mess, I'll refactor it for 3.1.
+
+------------------------------------------------------------------------
+r30701 | mss | 2004-07-25 22:41:26 +0000 (Sun, 25 Jul 2004) | 2 lines
+
+Trivial change to error output ("$! $@" -> "$! ($@)")
+
+------------------------------------------------------------------------
+r23229 | quinlan | 2004-07-25 05:30:00 +0000 (Sun, 25 Jul 2004) | 2 lines
+
+bug 3633: trivial fix as suggested by Bob Menschel
+
+------------------------------------------------------------------------
+r23195 | mss | 2004-07-23 18:41:19 +0000 (Fri, 23 Jul 2004) | 2 lines
+
+A small typo.
+
+------------------------------------------------------------------------
+r23193 | mss | 2004-07-23 18:32:55 +0000 (Fri, 23 Jul 2004) | 2 lines
+
+A little additional wordsmithing, but Klaus did a great job, most of the time I just corrected some lower-case characters (please use "E-Mail" instead of "E-mail" or even "e-Mail" in German).
+
+------------------------------------------------------------------------
+r23192 | felicity | 2004-07-23 15:43:19 +0000 (Fri, 23 Jul 2004) | 1 line
+
+update changes file for pre3
+------------------------------------------------------------------------
+r23178 | jm | 2004-07-23 04:08:57 +0000 (Fri, 23 Jul 2004) | 1 line
+
+3.0.0-pre4 devel cycle started
+------------------------------------------------------------------------
+r23175 | jm | 2004-07-23 04:06:20 +0000 (Fri, 23 Jul 2004) | 1 line
+
+3.0.0-pre3 RELEASED
+------------------------------------------------------------------------
 r23174 | quinlan | 2004-07-23 03:58:25 +0000 (Fri, 23 Jul 2004) | 2 lines
 
 move URIBL rules to URIBL section and zero their non-net scores

Modified: spamassassin/trunk/build/README
==============================================================================
--- spamassassin/trunk/build/README	(original)
+++ spamassassin/trunk/build/README	Wed Aug  4 20:06:01 2004
@@ -5,8 +5,16 @@
 - cd to the directory for the codebase you want the devel tree to
   come from
 
-    su - release
-    cd ~release/versions/cvshead
+    ssh minotaur.apache.org
+    cd [checkedoutdir]
+
+- ensure the required code and data is available for the build scripts:
+
+    ~/sabuildtools
+    ~/sasigningkey
+    ~/perl584
+
+  All can be copied from ~jm on minotaur if required.
 
 - run "./build/update_devel" to build the tar.gz files
 
@@ -21,8 +29,11 @@
 - cd to the directory for the codebase you want the release to
   come from
 
-    su - release
-    cd ~release/versions/cvshead
+    ssh minotaur.apache.org
+    cd [checkedoutdir]
+
+- ensure the required code and data is available for the build scripts:
+  see above.
 
 - edit lib/Mail/SpamAssassin.pm and comment the $IS_DEVEL_BUILD
   line.   Ensure the correct version number is present in $VERSION

Modified: spamassassin/trunk/lib/Mail/SpamAssassin.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin.pm	Wed Aug  4 20:06:01 2004
@@ -99,7 +99,7 @@
 };
 
 $VERSION = "3.000000";      # update after release (same format as perl $])
-$IS_DEVEL_BUILD = 1;        # change for release versions
+# $IS_DEVEL_BUILD = 1;        # change for release versions
 
 @ISA = qw();
 
@@ -108,7 +108,7 @@
 
 # If you hacked up your SA, you should add a version_tag to you .cf files.
 # This variable should not be modified directly.
-@EXTRA_VERSION = qw(pre3);
+@EXTRA_VERSION = qw(pre4);
 if (defined $IS_DEVEL_BUILD && $IS_DEVEL_BUILD) {
   push(@EXTRA_VERSION, ( 'r' . qw{$LastChangedRevision$ updated by SVN}[1] ));
 }

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Hashcash.pm
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Hashcash.pm	(original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/Plugin/Hashcash.pm	Wed Aug  4 20:06:01 2004
@@ -155,18 +155,58 @@
   if (defined $scanner->{hashcash_value}) { return $scanner->{hashcash_value}; }
 
   $scanner->{hashcash_value} = 0;
-  my $hc = $scanner->get ("X-Hashcash");
+
   # X-Hashcash: 0:031118:camram-spam@camram.org:c068b58ade6dcbaf
+  # or:
+  # X-hashcash: 1:20:040803:hashcash@freelists.org::6dcdb3a3ad4e1b86:1519d
+  # X-hashcash: 1:20:040803:jm@jmason.org::6b484d06469ccb28:8838a
+  # X-hashcash: 1:20:040803:adam@cypherspace.org::a1cbc54bf0182ea8:5d6a0
+
+  # call down to {msg} so that we can get it as an array of
+  # individual headers
+  my @hdrs = $scanner->{msg}->get_header ("X-Hashcash");
+
+  foreach my $hc (@hdrs) {
+    my $value = $self->_run_hashcash_for_one_string($scanner, $hc);
+    if ($value) {
+      # remove the "double-spend" bool if we did find a usable string;
+      # this happens when one string is already spent, but another
+      # string has not yet been.
+      delete $scanner->{hashcash_double_spent};
+      return $value;
+    }
+  }
+  return 0;
+}
+
+sub _run_hashcash_for_one_string {
+  my ($self, $scanner, $hc) = @_;
 
   if (!$hc) { return 0; }
+  $hc =~ s/\s+//gs;       # remove whitespace from multiline, folded tokens
 
   # untaint the string for paranoia, making sure not to allow \n \0 \' \"
   $hc =~ /^([-A-Za-z0-9\xA0-\xFF:_\/\%\@\.\,\= \*\+]+)$/; $hc = $1;
   if (!$hc) { return 0; }
 
-  my ($ver, $date, $rsrc, $trial);
-  ($ver, $date, $rsrc, $trial) = ($hc =~ /(\S+):(\S+):(\S+):(\S+)/ );
-  if (!$trial) { return 0; }
+  my ($ver, $bits, $date, $rsrc, $exts, $rand, $trial);
+  if ($hc =~ /^0:/) {
+    ($ver, $date, $rsrc, $trial) = split (/:/, $hc, 4);
+  }
+  elsif ($hc =~ /^1:/) {
+    ($ver, $bits, $date, $rsrc, $exts, $rand, $trial) =
+                                    split (/:/, $hc, 7);
+    # extensions are, as yet, unused by SpamAssassin
+  }
+  else {
+    dbg ("hashcash: version $ver stamps not yet supported");
+    return 0;
+  }
+
+  if (!$trial) {
+    dbg ("hashcash: no trial in stamp '$hc'");
+    return 0;
+  }
 
   my $accept = $scanner->{conf}->{hashcash_accept};
   if (!$self->_check_hashcash_resource ($scanner, $accept, $rsrc)) {
@@ -248,7 +288,8 @@
   foreach my $regexp (values %{$list})
   {
     # allow %u == current username
-    $regexp =~ s/\%u/$scanner->{main}->{username}/gs;
+    # \\ is added by $conf->add_to_addrlist()
+    $regexp =~ s/\\\%u/$scanner->{main}->{username}/gs;
 
     if ($addr =~ /$regexp/i) {
       return 1;

Modified: spamassassin/trunk/masses/parse-rules-for-masses
==============================================================================
--- spamassassin/trunk/masses/parse-rules-for-masses	(original)
+++ spamassassin/trunk/masses/parse-rules-for-masses	Wed Aug  4 20:06:01 2004
@@ -66,11 +66,22 @@
   foreach my $indir (@_) {
     my @files = <$indir/[0-9]*.cf>;
     my $file;
+    my $scores_mutable = 1;
     %rulesfound = ();
     %langs = ();
     foreach $file (sort @files) {
       open (IN, "<$file");
-      while (<IN>) {
+      while (<IN>)
+      {
+        # these appear in comments, so deal with them before comment stripping
+        # takes place
+        if (/<\/gen:mutable>/i) {
+          $scores_mutable = 0;
+        }
+        elsif (/<gen:mutable>/i) {
+          $scores_mutable = 1;
+        }
+
         s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
 
         my $lang = '';
@@ -106,6 +117,7 @@
 	    ($score) = (split(/\s+/,$score))[$scoreset];
 	  }
           $rules->{$name}->{score} = $score;
+          $rules->{$name}->{mutable} = $scores_mutable;
         }
       }
       close IN;
@@ -127,6 +139,10 @@
       } else {
         $rules->{$rule}->{score} = $def;
       }
+
+      # if a rule didn't have a score specified, assume it's
+      # mutable
+      $rules->{$name}->{mutable} = 1;
     }
   }
 }

Modified: spamassassin/trunk/masses/rewrite-cf-with-new-scores
==============================================================================
--- spamassassin/trunk/masses/rewrite-cf-with-new-scores	(original)
+++ spamassassin/trunk/masses/rewrite-cf-with-new-scores	Wed Aug  4 20:06:01 2004
@@ -161,6 +161,7 @@
   my ($name, @scores) = @_;
 
   my $isnet = ($rules{$name}->{tflags} =~ /\bnet\b/);
+  my $islearn = ($rules{$name}->{tflags} =~ /\blearn\b/);
 
   # Set defaults if not already set
   $scores[0] ||= 0;
@@ -175,6 +176,10 @@
   # net rules never have a non-zero score in sets 0 and 2
   for(my $i=0;$i<$NUM_SCORESETS;$i++) {
     if ($isnet && ($i & 1) == 0) {
+      $scores[$i] = 0;
+      $flag = 0 if ( $i > 0 && $scores[$i] != $scores[$i-1] );
+    }
+    if ($islearn && ($i & 2) == 0) {
       $scores[$i] = 0;
       $flag = 0 if ( $i > 0 && $scores[$i] != $scores[$i-1] );
     }

Modified: spamassassin/trunk/masses/score-ranges-from-freqs
==============================================================================
--- spamassassin/trunk/masses/score-ranges-from-freqs	(original)
+++ spamassassin/trunk/masses/score-ranges-from-freqs	Wed Aug  4 20:06:01 2004
@@ -103,19 +103,31 @@
   $freq_nonspam{$test} = $nonspam;
 
   my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
+
+  # "userconf" rules, or "net" rules in set 0/2, or "learn" rules
+  # in set 1/3, are nonmutable.
   if ($tflags =~ /\buserconf\b/ ||
-      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
+      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ ) ||
+      ( ($scoreset % 2) == 1 && $tflags =~ /\blearn\b/ ))
+  {
     $mutable_tests{$test} = 0;
   } else {
     $mutable_tests{$test} = 1;
   }
+
+  # rules read from the non-mutable section
+  if (!$rules{$test}->{mutable}) {
+    $mutable_tests{$test} = 0;
+  }
+
   if ($tflags =~ m/\bnice\b/i) {
     $is_nice{$test} = 1;
   } else {
     $is_nice{$test} = 0;
   }
 
-  if ($overall < 0.01) {        # less than 0.01% of messages were hit
+  # less than 0.01% of messages were hit: force these rules to 0.0
+  if ($overall < 0.01) {
     $mutable_tests{$test} = 0;
     $soratio{$test} = 0.5;
     $ranking{$test} = 0.0;
@@ -146,13 +158,11 @@
   my $ranking = $ranking{$test};
   my $mutable = $mutable_tests{$test};
 
-  # look for score of 0
-  # TODO: *why* do we do this?  it results in really good rules
-  # being disabled sometimes!
+  # non-mutable, or score of 0 -- lock down to current score.
   if (!$mutable || $rules{$test}->{score} == 0) {
     printf OUT ("%3.3f %3.3f 0 $test\n",
-                         $rules{$test}->{score},
-                         $rules{$test}->{score});
+                              $rules{$test}->{score},
+                              $rules{$test}->{score});
     next;
   }
 

Modified: spamassassin/trunk/rules/50_scores.cf
==============================================================================
--- spamassassin/trunk/rules/50_scores.cf	(original)
+++ spamassassin/trunk/rules/50_scores.cf	Wed Aug  4 20:06:01 2004
@@ -30,7 +30,7 @@
 # weighted to produce roughly 1 false positive in 1000 non-spam messages
 # using the default threshold of 5.0.
 
-# Start of generated scores
+# Start of generated scores.  <gen:mutable>
 
 score ACCEPT_CREDIT_CARDS 0.607 0.826 0.607 0.607
 score ACT_NOW_CAPS 0.162 0.483 0.379 0.357
@@ -809,7 +809,7 @@
 score BAYES_95 0 0 1.365 2.063
 score BAYES_99 0 0 1.673 1.886
 
-# End of generated scores.
+# End of generated scores.  </gen:mutable>
 
 # Scores for tests that are scored manually or with isolated rescore runs.
 # Most are net tests, userconf tests, tests occuring with very low frequency,