You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/11/12 04:03:32 UTC

svn commit: r332721 - /spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Author: jm
Date: Fri Nov 11 19:03:31 2005
New Revision: 332721

URL: http://svn.apache.org/viewcvs?rev=332721&view=rev
Log:
add support for --after switch, to reduce the massive volumes of mass-checkable mail

Modified:
    spamassassin/trunk/masses/corpora/mk-corpus-link-farm

Modified: spamassassin/trunk/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/corpora/mk-corpus-link-farm?rev=332721&r1=332720&r2=332721&view=diff
==============================================================================
--- spamassassin/trunk/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/trunk/masses/corpora/mk-corpus-link-farm Fri Nov 11 19:03:31 2005
@@ -34,11 +34,16 @@
   -dest outputdir [-num num]
 
 options:
-  -most_recent:         select the most recent messages (default)
+  -most_recent:  select the most recent messages (default)
+  -after=N       only test mails received after time_t N (negative values
+                 are an offset from current time, e.g. -86400 = last day)
+                 or after date as parsed by Time::ParseDate (e.g. '-6 months')
 
 ";
 }
 
+use Time::ParseDate;
+
 use Cwd;
 use File::Path;
 use File::Find;
@@ -59,12 +64,11 @@
 use Getopt::Long;
 use vars qw(
 
-  $opt_most_recent $opt_reuse
+  $opt_most_recent $opt_after $opt_before
 
 );
 
 $opt_most_recent = 0;
-$opt_reuse = 0;
 
 my $curdest;
 GetOptions(
@@ -84,12 +88,25 @@
     },
 
     'most_recent' => \$opt_most_recent,
+    'before=s' => \$opt_before,
+    'after=s' => \$opt_after,
 ) or usage();
 
 foreach my $arg (@ARGV) {
   push (@$srcs, { dir => $arg });
 }
 
+# Deal with --before and --after
+foreach my $time ($opt_before, $opt_after) {
+  if ($time && $time =~ /^-\d+$/) {
+    $time = time + $time;
+  }
+  elsif ($time && $time !~ /^-?\d+$/) {
+    $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
+  }
+}
+
+
 # test data: $srcs = [ { dir => "/src1", ham => { dests => [ ], dir =>
 # "/src1/ham", num => 100 }, spam => { dests => [ ], dir => "/src1/spam", num
 # => 100 }, }, { dir => "/src2", ham => { dests => [ ], dir => "/src2/ham", num
@@ -258,7 +275,11 @@
   my %files = ();
   File::Find::find(sub {
     return unless (-f $_ && -r _);    # not a file
-    my $mtime = (-M _);
+
+    my @stat = stat _;
+    my $mtime = $stat[9];
+    return unless message_is_useful_by_date($mtime);
+
     if (!exists $files{$mtime}) {
       $files{$mtime} = [ ];
     }
@@ -271,7 +292,7 @@
   }, $srcdir);
 
   my @files = ();
-  foreach my $key (sort { $a <=> $b } keys %files) {
+  foreach my $key (sort { $b <=> $a } keys %files) {
     push (@files, @{$files{$key}});
   }
   undef %files;     # no longer need that
@@ -423,6 +444,7 @@
   my $start = 0;            # start of a message
   my $where = 0;            # current byte offset
   my $in_header = 0;        # are in we a header?
+  my $fromline;
   while (!eof INPUT) {
     my $offset = $start;    # byte offset of this message
     while (<INPUT>) {
@@ -435,43 +457,47 @@
         $in_header = 1;
         $start = $where;
         $where = tell INPUT;
+        $fromline = $_;
         last;
       }
       $where = tell INPUT;
 
-      $counter++;
+      if (mbox_new_enough($fromline))
+      {
+        $counter++;
 
-      if (!$justcount) {
-        $newname = get_mbox_name ($mboxpath, $offset);
+        if (!$justcount) {
+          $newname = get_mbox_name ($mboxpath, $offset);
 
-        if (-f $newname && (-M _ >= -M INPUT)) {
-          # no need to recreate it, it's fresh
-        }
-        else {
-          seek (INPUT, $offset, 0);
-          open (OUTPUT, ">$newname") or die "cannot write to $newname";
-          binmode OUTPUT;
- 
-          my $past = 0;
-          while (<INPUT>) {
-            if ($past) {
-              last if substr($_,0,5) eq "From ";
-            } else {
-              $past = 1;
+          if (-f $newname && (-M _ >= -M INPUT)) {
+            # no need to recreate it, it's fresh
+          }
+          else {
+            seek (INPUT, $offset, 0);
+            open (OUTPUT, ">$newname") or die "cannot write to $newname";
+            binmode OUTPUT;
+  
+            my $past = 0;
+            while (<INPUT>) {
+              if ($past) {
+                last if substr($_,0,5) eq "From ";
+              } else {
+                $past = 1;
+              }
+              print OUTPUT;
             }
-            print OUTPUT;
+  
+            close OUTPUT or die "failed to write to $newname";
+  
+            utime $atime, $mtime, $newname
+                            or warn "failed to touch $newname";
+  
+            seek (INPUT, $where, 0);    # back to where we were
           }
- 
-          close OUTPUT or die "failed to write to $newname";
- 
-          utime $atime, $mtime, $newname
-                          or warn "failed to touch $newname";
- 
-          seek (INPUT, $where, 0);    # back to where we were
+  
+          push @created_files, $newname;
+          remove_from_poss_delete($newname);
         }
- 
-        push @created_files, $newname;
-        remove_from_poss_delete($newname);
       }
     }
   }
@@ -497,11 +523,226 @@
   return $dstname;
 }
 
+sub mbox_new_enough {
+  my ($fromline) = @_;
+
+  # From xscludshmkjgc@yahoo.com  Thu Apr 29 20:02:18 2004
+  $fromline .= " ".local_tz() unless $fromline =~ /(?:[-+]\d{4}|\b[A-Z]{2,4}\b)/;
+  my $time = first_date($fromline);
+  return $time;
+}
+
+sub message_is_useful_by_date {
+  my ($self, $date) = @_;
+
+  return 0 unless $date;        # undef or 0 date = unusable
+
+  if (!$opt_after && !$opt_before) {
+    # Not using the feature
+    return 1;
+  }
+  elsif (!$opt_before) {
+    # Just care about after
+    return $date > $opt_after;
+  }
+  else {
+    return (($date < $opt_before) && ($date > $opt_after));
+  }
+}
+
 sub dbg {
   return unless $DEBUG;
   warn "debug: ".join("", @_)."\n";
 }
 
+sub first_date {
+  my (@strings) = @_;
+
+  foreach my $string (@strings) {
+    my $time = parse_rfc822_date($string);
+    return $time if defined($time) && $time;
+  }
+  return undef;
+}
+
+###########################################################################
+
+# timezone mappings: in case of conflicts, use RFC 2822, then most
+# common and least conflicting mapping
+my %TZ = (
+	# standard
+	'UT'   => '+0000',
+	'UTC'  => '+0000',
+	# US and Canada
+	'NDT'  => '-0230',
+	'AST'  => '-0400',
+	'ADT'  => '-0300',
+	'NST'  => '-0330',
+	'EST'  => '-0500',
+	'EDT'  => '-0400',
+	'CST'  => '-0600',
+	'CDT'  => '-0500',
+	'MST'  => '-0700',
+	'MDT'  => '-0600',
+	'PST'  => '-0800',
+	'PDT'  => '-0700',
+	'HST'  => '-1000',
+	'AKST' => '-0900',
+	'AKDT' => '-0800',
+	'HADT' => '-0900',
+	'HAST' => '-1000',
+	# Europe
+	'GMT'  => '+0000',
+	'BST'  => '+0100',
+	'IST'  => '+0100',
+	'WET'  => '+0000',
+	'WEST' => '+0100',
+	'CET'  => '+0100',
+	'CEST' => '+0200',
+	'EET'  => '+0200',
+	'EEST' => '+0300',
+	'MSK'  => '+0300',
+	'MSD'  => '+0400',
+	'MET'  => '+0100',
+	'MEZ'  => '+0100',
+	'MEST' => '+0200',
+	'MESZ' => '+0200',
+	# South America
+	'BRST' => '-0200',
+	'BRT'  => '-0300',
+	# Australia
+	'AEST' => '+1000',
+	'AEDT' => '+1100',
+	'ACST' => '+0930',
+	'ACDT' => '+1030',
+	'AWST' => '+0800',
+	# New Zealand
+	'NZST' => '+1200',
+	'NZDT' => '+1300',
+	# Asia
+	'JST'  => '+0900',
+	'KST'  => '+0900',
+	'HKT'  => '+0800',
+	'SGT'  => '+0800',
+	'PHT'  => '+0800',
+	# Middle East
+	'IDT'  => '+0300',
+	);
+
+# month mappings
+my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
+	     jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
+
+sub local_tz {
+  # standard method for determining local timezone
+  my $time = time;
+  my @g = gmtime($time);
+  my @t = localtime($time);
+  my $z = $t[1]-$g[1]+($t[2]-$g[2])*60+($t[7]-$g[7])*1440+($t[5]-$g[5])*525600;
+  return sprintf("%+.2d%.2d", $z/60, $z%60);
+}
+
+sub parse_rfc822_date {
+  my ($date) = @_;
+  local ($_);
+  my ($yyyy, $mmm, $dd, $hh, $mm, $ss, $mon, $tzoff);
+
+  # make it a bit easier to match
+  $_ = " $date "; s/, */ /gs; s/\s+/ /gs;
+
+  # now match it in parts.  Date part first:
+  if (s/ (\d+) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{4}) / /i) {
+    $dd = $1; $mon = lc($2); $yyyy = $3;
+  } elsif (s/ (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) +(\d+) \d+:\d+:\d+ (\d{4}) / /i) {
+    $dd = $2; $mon = lc($1); $yyyy = $3;
+  } elsif (s/ (\d+) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{2,3}) / /i) {
+    $dd = $1; $mon = lc($2); $yyyy = $3;
+  } else {
+    dbg("util: time cannot be parsed: $date");
+    return undef;
+  }
+
+  # handle two and three digit dates as specified by RFC 2822
+  if (defined $yyyy) {
+    if (length($yyyy) == 2 && $yyyy < 50) {
+      $yyyy += 2000;
+    }
+    elsif (length($yyyy) != 4) {
+      # three digit years and two digit years with values between 50 and 99
+      $yyyy += 1900;
+    }
+  }
+
+  # hh:mm:ss
+  if (s/ (\d?\d):(\d\d)(:(\d\d))? / /) {
+    $hh = $1; $mm = $2; $ss = $4 || 0;
+  }
+
+  # numeric timezones
+  if (s/ ([-+]\d{4}) / /) {
+    $tzoff = $1;
+  }
+  # common timezones
+  elsif (s/\b([A-Z]{2,4}(?:-DST)?)\b/ / && exists $TZ{$1}) {
+    $tzoff = $TZ{$1};
+  }
+  # all other timezones are considered equivalent to "-0000"
+  $tzoff ||= '-0000';
+
+  # months
+  if (exists $MONTH{$mon}) {
+    $mmm = $MONTH{$mon};
+  }
+
+  $hh ||= 0; $mm ||= 0; $ss ||= 0; $dd ||= 0; $mmm ||= 0; $yyyy ||= 0;
+
+  # Time::Local (v1.10 at least) throws warnings when the dates cause
+  # a 32-bit overflow.  So force a min/max for year.
+  if ($yyyy > 2037) {
+    dbg("util: date after supported range, forcing year to 2037: $date");
+    $yyyy = 2037;
+  }
+  elsif ($yyyy < 1970) {
+    dbg("util: date before supported range, forcing year to 1970: $date");
+    $yyyy = 1971;
+  }
+
+  my $time;
+  eval {		# could croak
+    $time = timegm($ss, $mm, $hh, $dd, $mmm-1, $yyyy);
+  };
+
+  if ($@) {
+    dbg("util: time cannot be parsed: $date, $yyyy-$mmm-$dd $hh:$mm:$ss");
+    return undef;
+  }
+
+  if ($tzoff =~ /([-+])(\d\d)(\d\d)$/)	# convert to seconds difference
+  {
+    $tzoff = (($2 * 60) + $3) * 60;
+    if ($1 eq '-') {
+      $time += $tzoff;
+    } else {
+      $time -= $tzoff;
+    }
+  }
+
+  return $time;
+}
+
+sub time_to_rfc822_date {
+  my($time) = @_;
+
+  my @days = qw/Sun Mon Tue Wed Thu Fri Sat/;
+  my @months = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
+  my @localtime = localtime($time || time);
+  $localtime[5]+=1900;
+
+  sprintf("%s, %02d %s %4d %02d:%02d:%02d %s", $days[$localtime[6]], $localtime[3],
+    $months[$localtime[4]], @localtime[5,2,1,0], local_tz());
+}
+
+###########################################################################
 
 __DATA__