You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2007/05/05 14:35:45 UTC
svn commit: r535514 - /spamassassin/trunk/masses/rule-qa/corpus-hourly
Author: jm
Date: Sat May 5 05:35:44 2007
New Revision: 535514
URL: http://svn.apache.org/viewvc?view=rev&rev=535514
Log:
create temporary copies of the log files we process, to avoid race conditions where rsyncd uploads a new rev which we then think is data from an OLD rev
Modified:
spamassassin/trunk/masses/rule-qa/corpus-hourly
Modified: spamassassin/trunk/masses/rule-qa/corpus-hourly
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/corpus-hourly?view=diff&rev=535514&r1=535513&r2=535514
==============================================================================
--- spamassassin/trunk/masses/rule-qa/corpus-hourly (original)
+++ spamassassin/trunk/masses/rule-qa/corpus-hourly Sat May 5 05:35:44 2007
@@ -6,13 +6,13 @@
use Getopt::Long;
use vars qw(
- $corpusdir
+ $realcorpusdir
$opt_override
$opt_tag
);
GetOptions(
"tag=s" => \$opt_tag,
- "dir=s" => \$corpusdir,
+ "dir=s" => \$realcorpusdir,
"override=s" => \$opt_override,
);
@@ -51,11 +51,23 @@
&configure;
&init;
-if ($corpusdir) {
- print "reading logs from '$corpusdir'\n";
-}
-else {
- $corpusdir = $opt{corpus};
+my $corpusdir;
+
+if ($realcorpusdir) {
+ print "reading logs from '$realcorpusdir'\n";
+
+ # create a temp dir to hold hard links to the files we're working on. This
+ # is used so that the rsyncd can upload new source files, replacing
+ # our work files, without affecting us. use hard links for speed and
+ # efficiency
+ $corpusdir = "$opt{tmp}/parse.$$";
+ mkdir $corpusdir or die "cannot mkdir $corpusdir";
+ push @tmps, $corpusdir;
+ print "using $corpusdir to hold temporary links to processed logs\n";
+
+} else {
+ $realcorpusdir = $opt{corpus};
+ $corpusdir = $realcorpusdir; # no need to take copies
&update_rsync;
}
@@ -84,7 +96,7 @@
}
sub clean_up {
- system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
+ system "rm -rf $opt{tmp}/*.$$ ".join(' ', @tmps);
}
sub init {
@@ -97,7 +109,7 @@
}
sub update_rsync {
- chdir $corpusdir;
+ chdir $realcorpusdir;
# allow non-running of rsync under some circumstances
if ($opt{rsync_command}) {
@@ -134,19 +146,19 @@
}
sub locate {
- opendir(CORPUS, $corpusdir);
+ opendir(CORPUS, $realcorpusdir);
@files = sort readdir(CORPUS);
closedir(CORPUS);
@files = grep {
- /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
+ /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$realcorpusdir/$_" && -M _ < 10
} @files;
foreach my $file (@files) {
my $tag = 0;
my $headers = '';
- open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
+ open(FILE, "$realcorpusdir/$file") or warn "cannot read $realcorpusdir/$file";
while (my $line = <FILE>) {
last if $line !~ /^#/;
$headers .= $line;
@@ -168,18 +180,18 @@
}
close(FILE);
- my @s = stat("$corpusdir/$file");
+ my @s = stat("$realcorpusdir/$file");
$filesize{$file} = $s[7];
$mtime{$file} = $s[9];
if (!defined $time{$file}) {
- warn "$corpusdir/$file: no time found, ignored\n"; next;
+ warn "$realcorpusdir/$file: no time found, ignored\n"; next;
}
if (!defined $revision{$file}) {
- warn "$corpusdir/$file: no revision found, ignored\n"; next;
+ warn "$realcorpusdir/$file: no revision found, ignored\n"; next;
}
if ($revision{$file} eq 'unknown') {
- warn "$corpusdir/$file: not tagged with a revision, ignored\n"; next;
+ warn "$realcorpusdir/$file: not tagged with a revision, ignored\n"; next;
}
my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);
@@ -189,13 +201,16 @@
if ($file =~ /-net-/) {
$is_net_daterev{$daterev} = 1;
- print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
+ print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
}
else {
- print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
+ print "$realcorpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
}
get_rulemetadata_for_revision($daterev, $revision{$file});
+
+ link ("$realcorpusdir/$file", "$corpusdir/$file")
+ or die "cannot ln $realcorpusdir/$file to $corpusdir";
}
}
ruleqa broken [Re: svn commit: r535514 - /spamassassin/trunk/masses/rule-qa/corpus-hourly]
Posted by "Daryl C. W. O'Shea" <sp...@dostech.ca>.
jm@apache.org wrote:
> Author: jm
> Date: Sat May 5 05:35:44 2007
> New Revision: 535514
>
> URL: http://svn.apache.org/viewvc?view=rev&rev=535514
> Log:
> create temporary copies of the log files we process, to avoid race conditions where rsyncd uploads a new rev which we then think is data from an OLD rev
>
> Modified:
> spamassassin/trunk/masses/rule-qa/corpus-hourly
This, or one of the revisions up to r535536 (I haven't looked at them),
seems to have made things worse.
Sunday's active.list had a lot of rules removed from it, including all
the new sandbox rules. Probably related: at the moment ruleqa is only
showing logs from "bb-doc jm" for r535586, although I see logs from the
usual submitters for this rev on the server:
ham-bb-doc.log ham-bb-jm.log ham-daf.log ham-jm.log
spam-bb-doc.log spam-bb-jm.log spam-daf.log spam-jm.log
ham-bb-fredt.log ham-bb-zmi.log ham-dos.log ham-theo.log
spam-bb-fredt.log spam-bb-zmi.log spam-dos.log spam-theo.log
Daryl