You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spamassassin.apache.org by he...@apache.org on 2022/04/14 11:50:32 UTC

svn commit: r1899848 - in /spamassassin/trunk: lib/Mail/SpamAssassin/ArchiveIterator.pm lib/spamassassin-run.pod sa-learn.raw

Author: hege
Date: Thu Apr 14 11:50:31 2022
New Revision: 1899848

URL: http://svn.apache.org/viewvc?rev=1899848&view=rev
Log:
Further ArchiveIterator improvements, all of gzip/bzip2/xz/lz4/lzip/lzo are now detected and uncompressed automatically.

Modified:
    spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
    spamassassin/trunk/lib/spamassassin-run.pod
    spamassassin/trunk/sa-learn.raw

Modified: spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm?rev=1899848&r1=1899847&r2=1899848&view=diff
==============================================================================
--- spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm (original)
+++ spamassassin/trunk/lib/Mail/SpamAssassin/ArchiveIterator.pm Thu Apr 14 11:50:31 2022
@@ -236,12 +236,11 @@ sub set_functions {
 Generates the list of messages to process, then runs each message through
 the configured wanted subroutine.
 
-Files which are detected as C<gzip> compressed (regardless of extension,
-works also on Maildir files) are uncompressed automatically via IO::Zlib. 
-Files with C<.bz2> extension are automatically uncompressed via call to
-C<bzip2 -dc> command.  Files with C<.xz> extension are automatically
-uncompressed via call to C<xz -dc> command.  Compressed mailbox/mbox files
-are not supported.
+Compressed files are detected and uncompressed automatically regardless of
+file extension.  Supported formats are C<gzip>, C<bzip2>, C<xz>, C<lz4>,
+C<lzip>, C<lzo>.  Gzip is uncompressed via IO::Zlib module, others use their
+specific command line tool (bzip2/xz/lz4/lzip/lzop).  Compressed
+mailbox/mbox files are not supported.
 
 The target_paths array is expected to be either one element per path in the
 following format: C<class:format:raw_location>, or a hash reference containing
@@ -295,8 +294,10 @@ sub run {
     return 0;
   }
 
-  $self->{bzip2_path} = Mail::SpamAssassin::Util::find_executable_in_env_path('bzip2');
-  $self->{xz_path} = Mail::SpamAssassin::Util::find_executable_in_env_path('xz');
+  # Find some uncompressors (gzip is handled with IO::Zlib)
+  foreach ('bzip2','xz','lz4','lzip','lzop') {
+    $self->{$_.'_path'} = Mail::SpamAssassin::Util::find_executable_in_env_path($_);
+  }
 
   # scan the targets and get the number and list of messages
   $self->_scan_targets(\@targets,
@@ -618,38 +619,14 @@ sub _mail_open {
   my ($self, $file, $ignore_missing) = @_;
   my $fh;
 
-  # Assume that the file by default is just a plain file
-  my @expr = ( $file );
-  my $mode = '<';
-
-  # Handle some compressed files
-  if ($file =~ /\.bz2$/i) {
-    if ($self->{bzip2_path}) {
-      $mode = '-|';
-      unshift @expr, $self->{bzip2_path}, '-cd';
-    } else {
-      warn "archive-iterator: bzip2 executable required for $file\n";
-      return;
-    }
-  }
-  elsif ($file =~ /\.xz$/i) {
-    if ($self->{xz_path}) {
-      $mode = '-|';
-      unshift @expr, $self->{xz_path}, '-cd';
-    } else {
-      warn "archive-iterator: xz executable required for $file\n";
-      return;
-    }
-  }
-
   # Go ahead and try to open the file
   # bug 5288: the "magic" version of open will strip leading and trailing
   # whitespace from the expression.  switch to the three-argument version
   # of open which does not strip whitespace.  see "perldoc -f open" and
   # "perldoc perlipc" for more information.
-  if (!open ($fh, $mode, @expr)) {
+  if (!open($fh, '<', $file)) {
     # Don't warn about disappeared files
-    if ($ignore_missing && $mode eq '<' && $! == ENOENT) {
+    if ($ignore_missing && $! == ENOENT) {
       dbg("archive-iterator: no access to $file: $!");
     } else {
       warn "archive-iterator: no access to $file: $!\n"
@@ -660,13 +637,14 @@ sub _mail_open {
   # bug 5249: mail could have 8-bit data, need this on some platforms
   binmode $fh  or die "cannot set input file to binmode: $!";
 
-  # Detect gzip compressed data (only from files)
-  if ($mode eq '<' && -f $file && read($fh, my $magic, 2)) {
-    if ($magic eq "\x1F\x8B") {
-      dbg("archive-iterator: detected gzipped file $file, reopening with IO::Zlib");
+  # Detect compressed data (only from files, can't reopen pipe)
+  if (-f $file && read($fh, my $magic, 6)) {
+    # GZIP
+    if ($magic =~ /^\x1F\x8B/) {
+      dbg("archive-iterator: detected gzip file $file, reopening with IO::Zlib");
       close $fh  or die "error closing input file: $!";
       eval { require IO::Zlib; };
-      if ($@) { warn "IO::Zlib required for $file: $@\n"; return; }
+      if ($@) { warn "archive-iterator: IO::Zlib required for $file: $@\n"; return; }
       $fh = IO::Zlib->new($file, "rb");
       if (!$fh) {
         if ($ignore_missing && $! == ENOENT) {
@@ -676,6 +654,76 @@ sub _mail_open {
         }
         return;
       }
+    }
+    # BZIP2
+    elsif ($magic =~ /^\x42\x5A(?:\x68|\x30)/) {
+      dbg("archive-iterator: detected bzip2 file $file, reopening with bzip2");
+      close $fh  or die "error closing input file: $!";
+      if (!$self->{bzip2_path}) {
+        warn "archive-iterator: bzip2 executable required for $file\n";
+        return;
+      }
+      if (!open($fh, '-|', $self->{bzip2_path}, '-cd', $file)) {
+        warn "archive-iterator: no access to $file: $!\n";
+        return;
+      }
+      binmode $fh  or die "cannot set input file to binmode: $!";
+    }
+    # XZ
+    elsif ($magic =~ /^\xFD\x37\x7A\x58\x5A\x00/) {
+      dbg("archive-iterator: detected xz file $file, reopening with xz");
+      close $fh  or die "error closing input file: $!";
+      if (!$self->{xz_path}) {
+        warn "archive-iterator: xz executable required for $file\n";
+        return;
+      }
+      if (!open($fh, '-|', $self->{xz_path}, '-cd', $file)) {
+        warn "archive-iterator: no access to $file: $!\n";
+        return;
+      }
+      binmode $fh  or die "cannot set input file to binmode: $!";
+    }
+    # LZ4
+    elsif ($magic =~ /^\x04\x22\x4D\x18/) {
+      dbg("archive-iterator: detected lz4 file $file, reopening with lz4");
+      close $fh  or die "error closing input file: $!";
+      if (!$self->{lz4_path}) {
+        warn "archive-iterator: lz4 executable required for $file\n";
+        return;
+      }
+      if (!open($fh, '-|', $self->{lz4_path}, '-cd', $file)) {
+        warn "archive-iterator: no access to $file: $!\n";
+        return;
+      }
+      binmode $fh  or die "cannot set input file to binmode: $!";
+    }
+    # LZIP
+    elsif ($magic =~ /^\x4C\x5A\x49\x50/) {
+      dbg("archive-iterator: detected lzip file $file, reopening with lzip");
+      close $fh  or die "error closing input file: $!";
+      if (!$self->{lzip_path}) {
+        warn "archive-iterator: lzip executable required for $file\n";
+        return;
+      }
+      if (!open($fh, '-|', $self->{lzip_path}, '-cd', $file)) {
+        warn "archive-iterator: no access to $file: $!\n";
+        return;
+      }
+      binmode $fh  or die "cannot set input file to binmode: $!";
+    }
+    # LZO
+    elsif ($magic =~ /^\x89\x4C\x5A\x4F\x00\x0D/) {
+      dbg("archive-iterator: detected lzo file $file, reopening with lzop");
+      close $fh  or die "error closing input file: $!";
+      if (!$self->{lzop_path}) {
+        warn "archive-iterator: lzop executable required for $file\n";
+        return;
+      }
+      if (!open($fh, '-|', $self->{lzop_path}, '-cd', $file)) {
+        warn "archive-iterator: no access to $file: $!\n";
+        return;
+      }
+      binmode $fh  or die "cannot set input file to binmode: $!";
     } else {
       # Reset position
       seek($fh,0,0);

Modified: spamassassin/trunk/lib/spamassassin-run.pod
URL: http://svn.apache.org/viewvc/spamassassin/trunk/lib/spamassassin-run.pod?rev=1899848&r1=1899847&r2=1899848&view=diff
==============================================================================
--- spamassassin/trunk/lib/spamassassin-run.pod (original)
+++ spamassassin/trunk/lib/spamassassin-run.pod Thu Apr 14 11:50:31 2022
@@ -87,8 +87,8 @@ containing whitespace or beginning with
 The options I<--mbox> and I<--mbx> can override the assumed format,
 see the appropriate OPTION information below.
 
-Files compressed with gzip/bzip2/xz are uncompressed automatically.  See
-C<Mail::SpamAssassin::ArchiveIterator> for more details.
+Files compressed with gzip/bzip2/xz/lz4/lzip/lzo are uncompressed
+automatically.  See C<Mail::SpamAssassin::ArchiveIterator> for more details.
 
 Please note that SpamAssassin is not designed to scan large
 messages. Don't feed messages larger than about 500 KB to

Modified: spamassassin/trunk/sa-learn.raw
URL: http://svn.apache.org/viewvc/spamassassin/trunk/sa-learn.raw?rev=1899848&r1=1899847&r2=1899848&view=diff
==============================================================================
--- spamassassin/trunk/sa-learn.raw (original)
+++ spamassassin/trunk/sa-learn.raw Thu Apr 14 11:50:31 2022
@@ -679,8 +679,8 @@ that matches.  See C<Mail::SpamAssassin:
 If you are using mail boxes in format other than maildir you should use
 the B<--mbox> or B<--mbx> parameters.
 
-Files compressed with gzip/bzip2/xz are uncompressed automatically.  See
-C<Mail::SpamAssassin::ArchiveIterator> for more details.
+Files compressed with gzip/bzip2/xz/lz4/lzip/lzo are uncompressed
+automatically.  See C<Mail::SpamAssassin::ArchiveIterator> for more details.
 
 SpamAssassin remembers which mail messages it has learnt already, and will not
 re-learn those messages again, unless you use the B<--forget> option. Messages