You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by do...@apache.org on 2007/04/18 05:40:30 UTC

svn commit: r529848 - /spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/

Author: dos
Date: Tue Apr 17 20:40:28 2007
New Revision: 529848

URL: http://svn.apache.org/viewvc?view=rev&rev=529848
Log:
first whack at generating scores for newly promoted rules... fix scores for the rules from the base release, let the garescorer assign scores to the new rules based on the last 2 months of mail appearing in the nightly mass-check logs

Added:
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs   (with props)
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores   (with props)
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores   (with props)
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores   (with props)
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/masses-Makefile.patch
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets   (with props)
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set0
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set1
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set2
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set3
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set0
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set1
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set2
    spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set3

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs Tue Apr 17 20:40:28 2007
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+
+# add-hitless-active-to-freqs
+#
+# For every rule in ../rules/active.list whose name does not start with "__"
+# and that has no line in "freqs", append a zero-hit line to "freqs".  Those
+# of them that have no score line in ../rules/50_scores.cf (and are not AWL)
+# also get a "score RULE 0.000" line in "scores-active-zeroed".
+
+use strict;
+use warnings;
+
+my %active_rules;
+my %original_rules;
+
+# active rules, excluding names that start with "__"
+open(my $active_fh, '<', "../rules/active.list") or die "Cannot open active.list: $!";
+while (my $line = <$active_fh>) {
+  $active_rules{$1} = undef if $line =~ /^(?!__)(\S+)$/;
+}
+close $active_fh;
+
+# rules that already have a score line in the original score file
+open(my $orig_fh, '<', "../rules/50_scores.cf") or die "Cannot open original score file: $!";
+while (my $line = <$orig_fh>) {
+  $original_rules{$1} = undef if $line =~ /^score\s+(\S+)/;
+}
+close $orig_fh;
+
+# forget every active rule that is already listed in freqs; the rule name is
+# taken from the last whitespace-delimited field of each line
+open(my $freqs_in, '<', "freqs") or die "Cannot open freqs: $!";
+readline($freqs_in) for 1 .. 3;   # skip the first three lines
+while (my $line = <$freqs_in>) {
+  delete $active_rules{$1} if $line =~ /(\S+)$/;
+}
+close $freqs_in;
+
+# scores-active-zeroed is always (re)created, even if it ends up empty
+open(my $scores_fh, '>', "scores-active-zeroed") or die "Cannot open scores-active-zeroed: $!";
+if (keys %active_rules) {
+  open(my $freqs_out, '>>', "freqs") or die "Cannot open freqs: $!";
+  foreach my $rule (sort keys %active_rules) {
+    # no need to get the real scores for the base rules since there's no hits
+    # on them their score doesn't matter
+    print {$freqs_out} "  0.000   0.0000   0.0000    0.500   0.00    0.00  $rule\n";
+
+    # generate zero score lines for active.list non-base ruleset rules that are hitless
+    # skip the AWL rule, it doesn't have a static score  TODO: detect this automatically?!
+    next if exists $original_rules{$rule} || $rule eq 'AWL';
+    printf {$scores_fh} "score %-30s 0.000\n", $rule;
+  }
+  # buffered write errors only surface at close, so check it
+  close $freqs_out or die "Cannot close freqs: $!";
+}
+close $scores_fh or die "Cannot close scores-active-zeroed: $!";

Propchange: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/add-hitless-active-to-freqs
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores Tue Apr 17 20:40:28 2007
@@ -0,0 +1,53 @@
+#!/usr/bin/perl
+
+# extract-new-scores
+#
+# Copies the score lines of rules that have no score line in
+# ../rules/50_scores.cf from gen-set.../scores to gen-set.../scores-new.
+# The gen-set... directory name is built from the SCORESET, HAM_PREFERENCE,
+# THRESHOLD, EPOCHS and NOTE settings found in "config".
+
+use strict;
+use warnings;
+
+my $scoreset = 0; # default
+my $ham_pref = 5.0;
+my $threshold = 5.0;
+my $epochs = 100;
+my $note = '';
+
+my %original_rules;
+
+open(my $config_fh, '<', "config") or die "Cannot open config file: $!";
+while (my $line = <$config_fh>) {
+  $scoreset = $1 if $line =~ /^\s*SCORESET=(\d)\s*$/;
+  $ham_pref = $1 if $line =~ /^\s*HAM_PREFERENCE=([\d.-]+)/;
+  $threshold = $1 if $line =~ /^\s*THRESHOLD=([\d.-]+)/;
+  # \d+ rather than \d: a single-digit pattern never matches a setting such
+  # as EPOCHS=100 and would silently keep the default
+  $epochs = $1 if $line =~ /^\s*EPOCHS=(\d+)\s*$/;
+  $note = "-$1" if $line =~ /^\s*NOTE=(.+)$/;
+}
+close $config_fh;
+
+print "Removing scores for base release rules from newly generated scores\n";
+
+# rules that already have a score line in the original score file
+open(my $base_fh, '<', "../rules/50_scores.cf") or die "Cannot open original score file: $!";
+while (my $line = <$base_fh>) {
+  $original_rules{$1} = undef if $line =~ /^score\s+(\S+)/;
+}
+close $base_fh;
+
+my $gen_dir = "gen-set$scoreset-$ham_pref-$threshold-$epochs-ga$note";
+open(my $gen_fh, '<', "$gen_dir/scores") or die "Cannot open original scores file: $!";
+open(my $new_fh, '>', "$gen_dir/scores-new") or die "Cannot open scores-new file: $!";
+while (my $line = <$gen_fh>) {
+  # only score lines are copied, and only for rules not in the base release
+  next unless $line =~ /^score\s+(\S+)/;
+  next if exists $original_rules{$1};
+  print {$new_fh} $line;
+}
+close $gen_fh;
+# buffered write errors only surface at close, so check it
+close $new_fh or die "Cannot close scores-new file: $!";

Propchange: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/extract-new-scores
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores Tue Apr 17 20:40:28 2007
@@ -0,0 +1,177 @@
+#!/bin/sh
+
+# generate-new-scores - generate scores for rules promoted after initial
+#                       release mass-check scoring run
+#
+# usage: generate-new-scores (0|1|2|3)
+#
+# Run it from the directory that holds this script, RSYNC-CREDS,
+# masses-Makefile.patch, lock-scores, extract-new-scores,
+# add-hitless-active-to-freqs and merge-scoresets; all paths are relative.
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+SCORESET=$1
+
+# validate the argument before touching the network or the file system
+case "$SCORESET" in
+  0|1|2|3)
+    ;;
+  "")
+    echo "Missing scoreset number parameter" >&2
+    exit 1
+    ;;
+  *)
+    echo "Invalid scoreset number parameter: $SCORESET" >&2
+    exit 1
+    ;;
+esac
+
+# load rsync credentials from RSYNC-CREDS file
+# RSYNC_USERNAME="username"
+# RSYNC_PASSWORD="password"
+# (the explicit ./ is needed because "." looks a slash-less name up in PATH)
+. ./RSYNC-CREDS
+export RSYNC_PASSWORD
+
+# prep current nightly mass-check logs
+mkdir -p corpus
+cd corpus || exit 1
+rsync -artvz "$RSYNC_USERNAME@rsync.spamassassin.org::corpus/*.log" .
+
+# select a usable corpus (it'll use all available logs for the wanted score set
+# with the most recent revision found among logs for that score set)
+rm -rf usable-corpus
+mkdir usable-corpus
+
+# score sets 1 and 3 use the am-net- logs, the others use everything else
+if [ "$SCORESET" -eq 1 ] || [ "$SCORESET" -eq 3 ]; then
+  NET_FILTER="grep am-net-"
+else
+  NET_FILTER="grep -v am-net-"
+fi
+for FILE in $(find . -type f | $NET_FILTER); do
+  ln "$FILE" "usable-corpus/$FILE"
+done
+
+# a failed cd here could make the rm below delete the downloaded logs
+cd usable-corpus || exit 1
+REVISION=$(head * | grep "SVN revision" | cut -d" " -f4 | sort -rn | head -1)
+if [ -z "$REVISION" ]; then
+  echo "Cannot determine SVN revision from the usable corpus" >&2
+  exit 1
+fi
+for FILE in $(find . -type f); do
+  head "$FILE" | grep "SVN revision: $REVISION" || rm "$FILE"
+done
+
+# cthielen's ham logs seem to have a lot of spam in them
+rm -f *cthielen.log
+
+cd ../.. || exit 1
+
+# prep the ruleset checkout
+rm -rf trunk-new-rules
+
+svn co -r "$REVISION" https://svn.apache.org/repos/asf/spamassassin/trunk trunk-new-rules || exit 1
+svn co https://svn.apache.org/repos/asf/spamassassin/tags/spamassassin_release_3_2_0_rc_2/rules trunk-new-rules/rules-base || exit 1
+
+# the patched Makefile is what runs add-hitless-active-to-freqs and lock-scores
+patch -p0 < masses-Makefile.patch || exit 1
+cp -a lock-scores trunk-new-rules/masses/lock-scores
+cp -a extract-new-scores trunk-new-rules/masses/extract-new-scores
+cp -a add-hitless-active-to-freqs trunk-new-rules/masses/add-hitless-active-to-freqs
+
+cd trunk-new-rules || exit 1
+perl Makefile.PL < /dev/null
+make
+
+# strip scores from new rules so that the garescorer can set them
+grep -v '^score' rules/72_active.cf > rules/72_active.cf-scoreless
+mv -f rules/72_active.cf-scoreless rules/72_active.cf
+
+chmod +x masses/log-grep-recent	# this can go after April 21, 2007
+
+masses/log-grep-recent -m 38 ../corpus/usable-corpus/ham-*.log > masses/ham-full.log
+masses/log-grep-recent -m 2 ../corpus/usable-corpus/spam-*.log > masses/spam-full.log
+
+# set config to chosen scoreset
+cp "masses/config.set$SCORESET" masses/config
+. masses/config
+NAME="set$SCORESET"
+LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS-ga"
+
+# generate new ruleset
+cd masses || exit 1
+
+make clean
+rm -rf ORIG NSBASE SPBASE ham-validate.log spam-validate.log ham.log spam.log
+ln -s ham-full.log ham.log
+ln -s spam-full.log spam.log
+make freqs SCORESET="$SCORESET"
+
+cp freqs freqs.full	# probably not needed for anything - someday I'll look to see
+make > make.out 2>&1
+
+rm -rf ORIG NSBASE SPBASE ham-validate.log spam-validate.log ham.log spam.log
+mkdir ORIG
+for CLASS in ham spam ; do
+  ln "$CLASS-full.log" "ORIG/$CLASS.log"
+  for I in 0 1 2 3 ; do
+    ln -s "$CLASS.log" "ORIG/$CLASS-set$I.log"
+  done
+done
+
+# generate the new scores
+./runGA
+
+# generate stats on the old rules to compare against the new rules and their scores
+# NOTE(review): both runs are given the same --fnlog/--fplog paths, so the
+# second run presumably replaces what the first one logged there - confirm
+./fp-fn-statistics --ham ham-test.log --spam spam-test.log --scoreset "$SCORESET" \
+	--cffile=../rules-base --fnlog "$LOGDIR/false_negatives_original" \
+	--fplog "$LOGDIR/false_positives_original" > "$LOGDIR/stats-set$SCORESET-original-test"
+
+./fp-fn-statistics --ham ham.log --spam spam.log --scoreset "$SCORESET" \
+	--cffile=../rules-base --fnlog "$LOGDIR/false_negatives_original" \
+	--fplog "$LOGDIR/false_positives_original" > "$LOGDIR/stats-set$SCORESET-original-full"
+
+# extract the new scores
+./extract-new-scores
+mv "$LOGDIR/scores-new" "../../scores-set$SCORESET"
+
+# new active.list rules that didn't hit enough get zeroed... add the zero scores
+# for them, otherwise SA will assign 1.0 defaults (or use whatever was in the sandbox)
+if [ -s scores-active-zeroed ]; then
+  echo "# in active.list but have no hits in recent corpus" >> "../../scores-set$SCORESET"
+  cat scores-active-zeroed >> "../../scores-set$SCORESET"
+fi
+
+cd ../.. || exit 1
+./merge-scoresets
+echo
+cat scores
+
+# collect some stats
+echo "##### WITH NEW RULES AND SCORES #####" > "stats-set$SCORESET"
+head -10 "trunk-new-rules/masses/$LOGDIR/scores" >> "stats-set$SCORESET"
+cat "trunk-new-rules/masses/$LOGDIR/test" >> "stats-set$SCORESET"
+echo >> "stats-set$SCORESET"
+echo "##### WITHOUT NEW RULES AND SCORES #####" >> "stats-set$SCORESET"
+cat "trunk-new-rules/masses/$LOGDIR/stats-set$SCORESET-original-full" >> "stats-set$SCORESET"
+cat "trunk-new-rules/masses/$LOGDIR/stats-set$SCORESET-original-test" >> "stats-set$SCORESET"

Propchange: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/generate-new-scores
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores Tue Apr 17 20:40:28 2007
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+
+# lock-scores
+#
+# Writes tmp/ranges.data-new as a copy of tmp/ranges.data in which every rule
+# that has a score line in ../rules/50_scores.cf gets the line
+# "<score> <score> 0 <rule>", where <score> is that rule's score for the
+# scoreset selected by SCORESET= in "config".  All other lines are copied
+# unchanged.
+
+use strict;
+use warnings;
+
+my $scoreset = 0; # default
+
+my %rulescores;
+
+open(my $config_fh, '<', "config") or die "Cannot open config file: $!";
+while (my $line = <$config_fh>) {
+  # assign only when the line matched: after a failed match $1 does not hold
+  # anything from this line, so an unconditional copy could clobber the value
+  # don't exit loop in case scoreset appears in config again
+  $scoreset = $1 if $line =~ /^\s*SCORESET=(\d)\s*$/;
+}
+close $config_fh;
+
+print "Fixing score range for existing rules to current scoreset $scoreset score\n";
+
+open(my $orig_fh, '<', "../rules/50_scores.cf") or die "Cannot open original score file: $!";
+while (my $line = <$orig_fh>) {
+  # a score line carries either one score (used for all four scoresets) or
+  # four; score lines that do not parse are skipped rather than processed
+  # with captures left over from an earlier match
+  next unless $line =~ /^score\s+(\S+)\s+(-?[\d.]+)(?:\s+(-?[\d.]+)\s+(-?[\d.]+)\s+(-?[\d.]+))?/;
+  my @scores = defined $3 ? ($2, $3, $4, $5) : ($2, $2, $2, $2);
+  $rulescores{$1} = $scores[$scoreset];
+}
+close $orig_fh;
+
+open(my $ranges_fh, '<', "tmp/ranges.data") or die "Cannot open original range.data file: $!";
+open(my $new_fh, '>', "tmp/ranges.data-new") or die "Cannot open range.data-new file: $!";
+while (my $line = <$ranges_fh>) {
+  if ($line =~ /^(?:(?:-?[\d.]+) ){3}(\S+)$/ && defined $rulescores{$1}) {
+    print {$new_fh} "$rulescores{$1} $rulescores{$1} 0 $1\n";
+  } else {
+    print {$new_fh} $line;
+  }
+}
+close $ranges_fh;
+# buffered write errors only surface at close, so check it
+close $new_fh or die "Cannot close range.data-new file: $!";

Propchange: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/lock-scores
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/masses-Makefile.patch
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/masses-Makefile.patch?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/masses-Makefile.patch (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/masses-Makefile.patch Tue Apr 17 20:40:28 2007
@@ -0,0 +1,15 @@
+Index: Makefile
+===================================================================
+--- trunk-new-rules/masses/Makefile	(revision 529172)
++++ trunk-new-rules/masses/Makefile	(working copy)
+@@ -36,7 +36,10 @@
+ tmp/scores.h: tmp/tests.h
+ 
+ tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
++	perl add-hitless-active-to-freqs
+ 	perl score-ranges-from-freqs $(RULES) $(SCORESET) < freqs
++	perl lock-scores
++	mv tmp/ranges.data-new tmp/ranges.data
+ 
+ freqs: spam.log ham.log
+ 	perl hit-frequencies -c $(RULES) -x -p -s $(SCORESET) > freqs

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets?view=auto&rev=529848
==============================================================================
--- spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets (added)
+++ spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets Tue Apr 17 20:40:28 2007
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+# merge-scoresets
+#
+# Merges the "score RULE VALUE" lines of scores-set0 .. scores-set3 into a
+# single "scores" file holding one "score RULE v0 v1 v2 v3" line per rule,
+# sorted by rule name.  A scoreset that has no line for a rule, or a value
+# below $min_score, contributes 0.000.
+
+use strict;
+use warnings;
+
+my $min_score = 0.050;
+
+my %rules;
+
+for my $set (0 .. 3) {
+  open(my $in_fh, '<', "scores-set$set") or die "Cannot open scores-set$set: $!";
+  while (my $line = <$in_fh>) {
+    next unless $line =~ /^score (\S+)\s+(-?[\d.]+)$/;
+    my ($rule, $score) = ($1, $2);
+    $rules{$rule} = [ ('0.000') x 4 ] unless exists $rules{$rule};
+    # NOTE(review): the comparison is signed, so every negative score is
+    # also written out as 0.000 - confirm that this is intended
+    $rules{$rule}[$set] = ($score >= $min_score ? $score : '0.000');
+  }
+  close $in_fh;
+}
+
+open(my $out_fh, '>', "scores") or die "Cannot open scores: $!";
+foreach my $rule (sort keys %rules) {
+  printf {$out_fh} "score %-30s %s\n", $rule, join(' ', @{$rules{$rule}});
+}
+# buffered write errors only surface at close, so check it
+close $out_fh or die "Cannot close scores: $!";

Propchange: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/merge-scoresets
------------------------------------------------------------------------------
    svn:executable = *

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set0
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set0?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set1
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set1?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set2
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set2?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set3
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/scores-set3?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set0
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set0?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set1
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set1?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set2
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set2?view=auto&rev=529848
==============================================================================
    (empty)

Added: spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set3
URL: http://svn.apache.org/viewvc/spamassassin/rules/trunk/sandbox/dos/new-rule-score-gen/stats-set3?view=auto&rev=529848
==============================================================================
    (empty)