You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/11/15 03:27:27 UTC
svn commit: r344291 - /spamassassin/trunk/masses/corpora/mk-corpus-link-farm
Author: jm
Date: Mon Nov 14 18:27:24 2005
New Revision: 344291
URL: http://svn.apache.org/viewcvs?rev=344291&view=rev
Log:
fixed a few bugs in mbox extraction support
Modified:
spamassassin/trunk/masses/corpora/mk-corpus-link-farm
Modified: spamassassin/trunk/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/corpora/mk-corpus-link-farm?rev=344291&r1=344290&r2=344291&view=diff
==============================================================================
--- spamassassin/trunk/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/trunk/masses/corpora/mk-corpus-link-farm Mon Nov 14 18:27:24 2005
@@ -54,7 +54,7 @@
use SDBM_File;
use Fcntl;
-my $DEBUG; # $DEBUG=1;
+my $DEBUG;# $DEBUG=1;
my @classes = qw(ham spam);
my $srcs = [ ];
@@ -272,6 +272,8 @@
return;
}
+ dbg "linking from $srcdir";
+
# create a hash of modtime -> filepath, so we can be sure we pick up
# "new" files first if so desired. note that -M gives (now - modtime) in
# days, so larger numbers means earlier.
@@ -313,8 +315,10 @@
my $num = $destobj->{num};
my $destdir = $dest->{dir};
+ dbg " linking $num into $destdir";
+
my $i;
- for ($i = 0; $i <= $num; $i++)
+ for ($i = 0; $i < $num; $i++)
{
my $srcname = shift @files;
if (!$srcname) {
@@ -381,9 +385,10 @@
my ($dir) = @_;
File::Find::find(sub {
- if (-f $_) {
+ if (!-d $_) {
my $fname = $File::Find::name;
$poss_delete->{$fname} = 1;
+ dbg("marked as deleteable: $fname");
} else {
# TODO: delete dirs? for now, leave 'em behind
}
@@ -451,12 +456,11 @@
my $fromline;
while (!eof INPUT) {
my $offset = $start; # byte offset of this message
+
while (<INPUT>) {
- if ($in_header) {
- if (/^\s*$/) {
- $in_header = 0;
- }
- }
+
+nextfrom:
+ last unless defined($_);
if (substr($_,0,5) eq "From ") {
$in_header = 1;
$start = $where;
@@ -464,7 +468,6 @@
$fromline = $_;
last;
}
- $where = tell INPUT;
if (mbox_new_enough($fromline))
{
@@ -475,6 +478,15 @@
if (-f $newname && (-M _ >= -M INPUT)) {
# no need to recreate it, it's fresh
+
+ my $past = 0;
+ while (<INPUT>) {
+ if ($past) {
+ last if (!defined($_) || substr($_,0,5) eq "From ");
+ } else {
+ $past = 1;
+ }
+ }
}
else {
seek (INPUT, $offset, 0);
@@ -484,7 +496,7 @@
my $past = 0;
while (<INPUT>) {
if ($past) {
- last if substr($_,0,5) eq "From ";
+ last if (!defined($_) || substr($_,0,5) eq "From ");
} else {
$past = 1;
}
@@ -495,12 +507,16 @@
utime $atime, $mtime, $newname
or warn "failed to touch $newname";
-
- seek (INPUT, $where, 0); # back to where we were
}
push @created_files, $newname;
remove_from_poss_delete($newname);
+
+ $where = tell INPUT;
+ $offset = $where;
+
+ # we've already read the next "From " line, parse it now
+ goto nextfrom;
}
}
}
@@ -793,4 +809,8 @@
../mk-corpus-link-farm \
-dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
src*
+
+ ../mk-corpus-link-farm \
+ -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
+ src1/*.mbox src2 src3