You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/11/15 03:27:27 UTC

svn commit: r344291 - /spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Author: jm
Date: Mon Nov 14 18:27:24 2005
New Revision: 344291

URL: http://svn.apache.org/viewcvs?rev=344291&view=rev
Log:
fixed a few bugs in mbox extraction support

Modified:
    spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Modified: spamassassin/trunk/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/corpora/mk-corpus-link-farm?rev=344291&r1=344290&r2=344291&view=diff
==============================================================================
--- spamassassin/trunk/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/trunk/masses/corpora/mk-corpus-link-farm Mon Nov 14 18:27:24 2005
@@ -54,7 +54,7 @@
 use SDBM_File;
 use Fcntl;
 
-my $DEBUG; # $DEBUG=1;
+my $DEBUG;# $DEBUG=1;
 
 my @classes = qw(ham spam);
 my $srcs = [ ];
@@ -272,6 +272,8 @@
     return;
   }
 
+  dbg "linking from $srcdir";
+
   # create a hash of modtime -> filepath, so we can be sure we pick up
   # "new" files first if so desired. note that -M gives (now - modtime) in
   # days, so larger numbers mean earlier modification times.
@@ -313,8 +315,10 @@
     my $num = $destobj->{num};
     my $destdir = $dest->{dir};
 
+    dbg "  linking $num into $destdir";
+
     my $i;
-    for ($i = 0; $i <= $num; $i++)
+    for ($i = 0; $i < $num; $i++)
     {
       my $srcname = shift @files;
       if (!$srcname) {
@@ -381,9 +385,10 @@
   my ($dir) = @_;
 
   File::Find::find(sub {
-      if (-f $_) {
+      if (!-d $_) {
         my $fname = $File::Find::name;
         $poss_delete->{$fname} = 1;
+        dbg("marked as deleteable: $fname");
       } else {
         # TODO: delete dirs?  for now, leave 'em behind
       }
@@ -451,12 +456,11 @@
   my $fromline;
   while (!eof INPUT) {
     my $offset = $start;    # byte offset of this message
+
     while (<INPUT>) {
-      if ($in_header) {
-        if (/^\s*$/) {
-          $in_header = 0;
-        }
-      }
+
+nextfrom:
+      last unless defined($_);
       if (substr($_,0,5) eq "From ") {
         $in_header = 1;
         $start = $where;
@@ -464,7 +468,6 @@
         $fromline = $_;
         last;
       }
-      $where = tell INPUT;
 
       if (mbox_new_enough($fromline))
       {
@@ -475,6 +478,15 @@
 
           if (-f $newname && (-M _ >= -M INPUT)) {
             # no need to recreate it, it's fresh
+  
+            my $past = 0;
+            while (<INPUT>) {
+              if ($past) {
+                last if (!defined($_) || substr($_,0,5) eq "From ");
+              } else {
+                $past = 1;
+              }
+            }
           }
           else {
             seek (INPUT, $offset, 0);
@@ -484,7 +496,7 @@
             my $past = 0;
             while (<INPUT>) {
               if ($past) {
-                last if substr($_,0,5) eq "From ";
+                last if (!defined($_) || substr($_,0,5) eq "From ");
               } else {
                 $past = 1;
               }
@@ -495,12 +507,16 @@
   
             utime $atime, $mtime, $newname
                             or warn "failed to touch $newname";
-  
-            seek (INPUT, $where, 0);    # back to where we were
           }
   
           push @created_files, $newname;
           remove_from_poss_delete($newname);
+
+          $where = tell INPUT;
+          $offset = $where;
+
+          # we've already read the next "From " line, parse it now
+          goto nextfrom;
         }
       }
     }
@@ -793,4 +809,8 @@
   ../mk-corpus-link-farm \
       -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
       src*
+
+  ../mk-corpus-link-farm \
+      -dest ./out1 -num 1 -dest ./out2 -num 2 -dest ./out3 -num 5 \
+      src1/*.mbox src2 src3