You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/05/05 14:35:45 UTC

svn commit: r535514 - /spamassassin/trunk/masses/rule-qa/corpus-hourly

Author: jm
Date: Sat May  5 05:35:44 2007
New Revision: 535514

URL: http://svn.apache.org/viewvc?view=rev&rev=535514
Log:
create temporary copies of the log files we process, to avoid race conditions where rsyncd uploads a new rev which we then think is data from an OLD rev

Modified:
    spamassassin/trunk/masses/rule-qa/corpus-hourly

Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-hourly?view=diff&rev=535514&r1=535513&r2=535514
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Sat May  5 05:35:44 2007
@@ -6,13 +6,13 @@
 use Getopt::Long;
 
 use vars qw(
-    $corpusdir
+    $realcorpusdir
     $opt_override
     $opt_tag
 );
 GetOptions(
     "tag=s" => \$opt_tag,
-    "dir=s" => \$corpusdir,
+    "dir=s" => \$realcorpusdir,
     "override=s" => \$opt_override,
 );
 
@@ -51,11 +51,23 @@
 &configure;
 &init;
 
-if ($corpusdir) {
-  print "reading logs from '$corpusdir'\n";
-}
-else {
-  $corpusdir = $opt{corpus};
+my $corpusdir;
+
+if ($realcorpusdir) {
+  print "reading logs from '$realcorpusdir'\n";
+
+  # create a temp dir to hold hard links to the files we're working on.  This
+  # is used so that the rsyncd can upload new source files, replacing
+  # our work files, without affecting us.  use hard links for speed and
+  # efficiency
+  $corpusdir = "$opt{tmp}/parse.$$";
+  mkdir $corpusdir or die "cannot mkdir $corpusdir";
+  push @tmps, $corpusdir;
+  print "using $corpusdir to hold temporary links to processed logs\n";
+
+} else {
+  $realcorpusdir = $opt{corpus};
+  $corpusdir = $realcorpusdir;      # no need to take copies
   &update_rsync;
 }
 
@@ -84,7 +96,7 @@
 }
 
 sub clean_up {
-  system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
+  system "rm -rf $opt{tmp}/*.$$ ".join(' ', @tmps);
 }
 
 sub init {
@@ -97,7 +109,7 @@
 }
 
 sub update_rsync {
-  chdir $corpusdir;
+  chdir $realcorpusdir;
 
   # allow non-running of rsync under some circumstances
   if ($opt{rsync_command}) {
@@ -134,19 +146,19 @@
 }
 
 sub locate {
-  opendir(CORPUS, $corpusdir);
+  opendir(CORPUS, $realcorpusdir);
   @files = sort readdir(CORPUS);
   closedir(CORPUS);
 
   @files = grep {
-    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$realcorpusdir/$_" && -M _ < 10 
   } @files;
 
   foreach my $file (@files) {
     my $tag = 0;
     my $headers = '';
 
-    open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
+    open(FILE, "$realcorpusdir/$file") or warn "cannot read $realcorpusdir/$file";
     while (my $line = <FILE>) {
       last if $line !~ /^#/;
       $headers .= $line;
@@ -168,18 +180,18 @@
     }
     close(FILE);
 
-    my @s = stat("$corpusdir/$file");
+    my @s = stat("$realcorpusdir/$file");
     $filesize{$file} = $s[7];
     $mtime{$file} = $s[9];
 
     if (!defined $time{$file}) {
-      warn "$corpusdir/$file: no time found, ignored\n"; next;
+      warn "$realcorpusdir/$file: no time found, ignored\n"; next;
     }
     if (!defined $revision{$file}) {
-      warn "$corpusdir/$file: no revision found, ignored\n"; next;
+      warn "$realcorpusdir/$file: no revision found, ignored\n"; next;
     }
     if ($revision{$file} eq 'unknown') {
-      warn "$corpusdir/$file: not tagged with a revision, ignored\n"; next;
+      warn "$realcorpusdir/$file: not tagged with a revision, ignored\n"; next;
     }
 
     my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);
@@ -189,13 +201,16 @@
 
     if ($file =~ /-net-/) {
       $is_net_daterev{$daterev} = 1;
-      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
+      print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
     }
     else {
-      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
+      print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
     }
 
     get_rulemetadata_for_revision($daterev, $revision{$file});
+
+    link ("$realcorpusdir/$file", "$corpusdir/$file")
+            or die "cannot ln $realcorpusdir/$file to $corpusdir";
   }
 }
 



ruleqa broken [Re: svn commit: r535514 - /spamassassin/trunk/masses/rule-qa/corpus-hourly]

Posted by "Daryl C. W. O'Shea" <sp...@dostech.ca>.
jm@apache.org wrote:
> Author: jm
> Date: Sat May  5 05:35:44 2007
> New Revision: 535514
> 
> URL: http://svn.apache.org/viewvc?view=rev&rev=535514
> Log:
> create temporary copies of the log files we process, to avoid race conditions where rsyncd uploads a new rev which we then think is data from an OLD rev
> 
> Modified:
>     spamassassin/trunk/masses/rule-qa/corpus-hourly

This commit, or one of the later revisions up to r535536 (I haven't
looked at them), seems to have made things worse.

Sunday's active.list had a lot of rules removed from it, including all
the new sandbox rules. Probably related: at the moment ruleqa is only
showing logs from "bb-doc jm" for r535586, even though I see logs from
the usual submitters for this rev on the server:

ham-bb-doc.log     ham-bb-jm.log    ham-daf.log   ham-jm.log
spam-bb-doc.log    spam-bb-jm.log   spam-daf.log  spam-jm.log
ham-bb-fredt.log   ham-bb-zmi.log   ham-dos.log   ham-theo.log
spam-bb-fredt.log  spam-bb-zmi.log  spam-dos.log  spam-theo.log


Daryl