You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/12/10 00:49:36 UTC

svn commit: r355654 - /spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Author: jm
Date: Fri Dec  9 15:49:34 2005
New Revision: 355654

URL: http://svn.apache.org/viewcvs?rev=355654&view=rev
Log:
cope with mboxes that start with blank lines

Modified:
    spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Modified: spamassassin/trunk/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/corpora/mk-corpus-link-farm?rev=355654&r1=355653&r2=355654&view=diff
==============================================================================
--- spamassassin/trunk/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/trunk/masses/corpora/mk-corpus-link-farm Fri Dec  9 15:49:34 2005
@@ -473,6 +473,7 @@
   my $where = 0;            # current byte offset
   my $in_header = 0;        # are we in a header?
   my $fromline;
+
   while (!eof INPUT) {
     my $offset = $start;    # byte offset of this message
 
@@ -487,58 +488,61 @@
         $fromline = $_;
         last;
       }
+    }
+    last unless defined($_);
+
+    # dbg "mbox From: $counter $start $where $fromline";
 
-      if (mbox_new_enough($fromline))
-      {
-        $counter++;
-
-        if (!$justcount) {
-          $newname = get_mbox_name ($mboxpath, $offset);
-
-          if (-f $newname && (-M _ >= -M INPUT)) {
-            # no need to recreate it, it's fresh
-  
-            my $past = 0;
-            while (<INPUT>) {
-              if ($past) {
-                last if (!defined($_) || substr($_,0,5) eq "From ");
-              } else {
-                $past = 1;
-              }
+    if ($fromline && mbox_new_enough($fromline))
+    {
+      $counter++;
+
+      if (!$justcount) {
+        $newname = get_mbox_name ($mboxpath, $offset);
+
+        if (-f $newname && (-M _ >= -M INPUT)) {
+          # no need to recreate it, it's fresh
+
+          my $past = 0;
+          while (<INPUT>) {
+            if ($past) {
+              last if (!defined($_) || substr($_,0,5) eq "From ");
+            } else {
+              $past = 1;
             }
           }
-          else {
-            seek (INPUT, $offset, 0);
-            open (OUTPUT, ">$newname") or die "cannot write to $newname";
-            binmode OUTPUT;
-  
-            my $past = 0;
-            while (<INPUT>) {
-              if ($past) {
-                last if (!defined($_) || substr($_,0,5) eq "From ");
-              } else {
-                $past = 1;
-              }
-              print OUTPUT;
+        }
+        else {
+          seek (INPUT, $where, 0);
+          open (OUTPUT, ">$newname") or die "cannot write to $newname";
+          binmode OUTPUT;
+
+          my $past = 0;
+          while (<INPUT>) {
+            if ($past) {
+              last if (!defined($_) || substr($_,0,5) eq "From ");
+            } else {
+              $past = 1;
             }
-  
-            close OUTPUT or die "failed to write to $newname";
-  
-            chmod 0644, $newname or warn "cannot chmod $newname";
-
-            utime $atime, $mtime, $newname
-                            or warn "failed to touch $newname";
+            print OUTPUT;
           }
-  
-          push @created_files, $newname;
-          remove_from_poss_delete($newname);
 
-          $where = tell INPUT;
-          $offset = $where;
+          close OUTPUT or die "failed to write to $newname";
 
-          # we've already read the next "From " line, parse it now
-          goto nextfrom;
+          chmod 0644, $newname or warn "cannot chmod $newname";
+
+          utime $atime, $mtime, $newname
+                          or warn "failed to touch $newname";
         }
+
+        push @created_files, $newname;
+        remove_from_poss_delete($newname);
+
+        $where = tell INPUT;
+        $offset = $where;
+
+        # we've already read the next "From " line, parse it now
+        goto nextfrom;
       }
     }
   }
@@ -571,7 +575,7 @@
   my ($fromline) = @_;
 
   # From xscludshmkjgc@yahoo.com  Thu Apr 29 20:02:18 2004
-  return unless ($fromline =~ /^From \S+  (.*)$/);
+  return unless ($fromline && $fromline =~ /^From \S+ +(.*)$/);
 
   $fromline = $1;
   $fromline .= " ".local_tz() unless $fromline =~ /(?:[-+]\d{4}|\b[A-Z]{2,4}\b)/;