You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by jm...@apache.org on 2005/10/31 02:41:32 UTC
svn commit: r329698 - /spamassassin/trunk/masses/corpora/mk-corpus-link-farm
Author: jm
Date: Sun Oct 30 17:41:31 2005
New Revision: 329698
URL: http://svn.apache.org/viewcvs?rev=329698&view=rev
Log:
add mbox-extraction support
Modified:
spamassassin/trunk/masses/corpora/mk-corpus-link-farm
Modified: spamassassin/trunk/masses/corpora/mk-corpus-link-farm
URL: http://svn.apache.org/viewcvs/spamassassin/trunk/masses/corpora/mk-corpus-link-farm?rev=329698&r1=329697&r2=329698&view=diff
==============================================================================
--- spamassassin/trunk/masses/corpora/mk-corpus-link-farm (original)
+++ spamassassin/trunk/masses/corpora/mk-corpus-link-farm Sun Oct 30 17:41:31 2005
@@ -49,6 +49,8 @@
my @classes = qw(ham spam);
my $srcs = [ ];
my $dests = [ ];
+my $mbox_tmpdir = $ENV{TMPDIR} || "/tmp";
+
sub dbg;
use Getopt::Long;
@@ -93,6 +95,12 @@
# { dests => [ ], dir => "/src3/spam", num => 500 }, } ];
my $cwd = cwd();
+
+my $mbox_work = "$mbox_tmpdir/mboxes.d";
+if (-d $mbox_work) {
+ rmtree($mbox_work) or warn "cannot rmtree $mbox_work: $!";
+}
+
main();
exit;
@@ -109,7 +117,13 @@
foreach my $src (@$srcs) {
my $num_files;
my $cb = sub {
- if (-f $_ && -r _) { $num_files++; }
+ if (-f $_ && -r _) {
+ if ($_ =~ /\.mbox/) {
+ $num_files += mbox_count($File::Find::name);
+ } else {
+ $num_files++;
+ }
+ }
};
$src->{ham} = { num => 0, dests => [ ] };
@@ -210,7 +224,11 @@
if (!exists $files{$mtime}) {
$files{$mtime} = [ ];
}
- push(@{$files{$mtime}}, $File::Find::name);
+ if ($_ =~ /\.mbox/) {
+ push(@{$files{$mtime}}, mbox_extract_all($File::Find::name));
+ } else {
+ push(@{$files{$mtime}}, $File::Find::name);
+ }
}, $srcdir);
@@ -281,6 +299,97 @@
$src->{$class}{num} -= $$nhamref;
$$nhamref = 0;
}
+}
+
+sub mbox_count {
+ my ($mboxpath) = @_;
+ return _mbox_extract_all($mboxpath, 1);
+}
+
+sub mbox_extract_all {
+ my ($mboxpath) = @_;
+ return _mbox_extract_all($mboxpath, 0);
+}
+
+sub _mbox_extract_all {
+ my ($mboxpath, $justcount) = @_;
+
+ # create an area to hold extracted mbox files
+ # this cannot use $$, it must remain the same between runs
+ if (!-d $mbox_work) {
+ mkdir $mbox_work or die "cannot create tmpdir: $mbox_work";
+ # fatal error, could be an attack
+ }
+
+ my $counter = 0;
+ my @created_files = ();
+
+ open (INPUT, "<$mboxpath") or die "cannot read $mboxpath";
+ binmode INPUT;
+
+ my $start = 0; # start of a message
+ my $where = 0; # current byte offset
+ my $in_header = 0; # are in we a header?
+ while (!eof INPUT) {
+ my $offset = $start; # byte offset of this message
+ while (<INPUT>) {
+ if ($in_header) {
+ if (/^\s*$/) {
+ $in_header = 0;
+ }
+ }
+ if (substr($_,0,5) eq "From ") {
+ $in_header = 1;
+ $start = $where;
+ $where = tell INPUT;
+ last;
+ }
+ $where = tell INPUT;
+
+ if ($justcount) {
+ $counter++;
+ }
+ else {
+ my $newname = get_mbox_name ($mboxpath, $offset);
+ seek (INPUT, $offset, 0);
+ open (OUTPUT, ">$newname") or die "cannot write to $newname";
+ binmode OUTPUT;
+
+ my $past = 0;
+ while (<INPUT>) {
+ if ($past) {
+ last if substr($_,0,5) eq "From ";
+ } else {
+ $past = 1;
+ }
+ print OUTPUT;
+ }
+
+ close OUTPUT or die "failed to write to $newname";
+ push @created_files, $newname;
+ seek (INPUT, $where, 0); # back to where we were
+ }
+ }
+ }
+ close INPUT;
+
+ if ($justcount) {
+ return $counter;
+ }
+ else {
+ return @created_files;
+ }
+}
+
+sub get_mbox_name {
+ my ($mboxpath, $where) = @_;
+
+ my $dstname = $mboxpath;
+ $dstname =~ s/[^-_\.A-Za-z0-9]/_/gs;
+ $dstname =~ s/_+/_/gs;
+ $dstname =~ s/^_//gs;
+ $dstname = $mbox_work."/".$dstname.".OFF".$where;
+ return $dstname;
}
sub dbg {