You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/03/13 07:47:36 UTC

svn commit: r1577038 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pkg/PackageParser.java test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java

Author: nick
Date: Thu Mar 13 06:47:35 2014
New Revision: 1577038

URL: http://svn.apache.org/r1577038
Log:
TIKA-1243 Add 7z support now that we have upgraded to Commons Compress 1.8, but it is a little nasty until COMPRESS-269 is resolved

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1577038&r1=1577037&r2=1577038&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Thu Mar 13 06:47:35 2014
@@ -25,10 +25,12 @@ import org.apache.commons.compress.archi
 import org.apache.commons.compress.archivers.ArchiveException;
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
 import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
 import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
 import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
 import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.tika.exception.TikaException;
@@ -69,7 +71,7 @@ public class PackageParser extends Abstr
     private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
 
     private static final Set<MediaType> SUPPORTED_TYPES =
-            MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR);
+            MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
 
     static MediaType getMediaType(ArchiveInputStream stream) {
         if (stream instanceof JarArchiveInputStream) {
@@ -84,6 +86,8 @@ public class PackageParser extends Abstr
             return DUMP;
         } else if (stream instanceof TarArchiveInputStream) {
             return TAR;
+        } else if (stream instanceof SevenZWrapper) {
+            return SEVENZ;
         } else {
             return MediaType.OCTET_STREAM;
         }
@@ -107,12 +111,26 @@ public class PackageParser extends Abstr
         stream = new CloseShieldInputStream(stream);
 
         // Ensure that the stream supports the mark feature
-        stream = new BufferedInputStream(stream);
+        if (! TikaInputStream.isTikaInputStream(stream)) {
+            stream = new BufferedInputStream(stream);
+        }
 
         ArchiveInputStream ais;
         try {
             ArchiveStreamFactory factory = new ArchiveStreamFactory();
             ais = factory.createArchiveInputStream(stream);
+        } catch (StreamingNotSupportedException sne) {
+            // Most archive formats work on streams, but a few need files
+            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+                // Rework as a file, and wrap
+                stream.reset();
+                TikaInputStream tstream = TikaInputStream.get(stream);
+                
+                // Pending a fix for COMPRESS-269, this bit is a little nasty
+                ais = new SevenZWrapper(new SevenZFile(tstream.getFile()));
+            } else {
+                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+            }
         } catch (ArchiveException e) {
             throw new TikaException("Unable to unpack document stream", e);
         }
@@ -178,4 +196,29 @@ public class PackageParser extends Abstr
         }
     }
 
+    // Pending a fix for COMPRESS-269, we have to wrap SevenZFile as an ArchiveInputStream ourselves
+    private static class SevenZWrapper extends ArchiveInputStream {
+        private SevenZFile file;
+        private SevenZWrapper(SevenZFile file) {
+            this.file = file;
+        }
+        
+        @Override
+        public int read() throws IOException {
+            return file.read();
+        }
+        @Override
+        public int read(byte[] b) throws IOException {
+            return file.read(b);
+        }
+        @Override
+        public int read(byte[] b, int off, int len) throws IOException {
+            return file.read(b, off, len);
+        }
+
+        @Override
+        public ArchiveEntry getNextEntry() throws IOException {
+            return file.getNextEntry();
+        }
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1577038&r1=1577037&r2=1577038&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Thu Mar 13 06:47:35 2014
@@ -23,24 +23,30 @@ import static org.junit.Assert.assertTru
 import java.io.InputStream;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
-import org.junit.Ignore;
 import org.xml.sax.ContentHandler;
 
 /**
  * Test case for parsing 7z files.
  */
 public class Seven7ParserTest extends AbstractPkgTest {
-    @Ignore // Pending a fix of COMPRESS-267, see TIKA-1243
+    private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+    
     @Test
     public void test7ZParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-
+        
+        // Ensure 7zip is a parsable format
+        assertTrue("No 7zip parser found", 
+                parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+        
+        // Parse
         InputStream stream = TarParserTest.class.getResourceAsStream(
                 "/test-documents/test-documents.7z");
         try {
@@ -49,7 +55,7 @@ public class Seven7ParserTest extends Ab
             stream.close();
         }
 
-        assertEquals("application/x-7z-compressed", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertTrue(content.contains("test-documents/testEXCEL.xls"));
         assertTrue(content.contains("Sample Excel Worksheet"));
@@ -75,7 +81,6 @@ public class Seven7ParserTest extends Ab
      * Tests that the ParseContext parser is correctly
      *  fired for all the embedded entries.
      */
-    @Ignore // Pending a fix of COMPRESS-267, see TIKA-1243
     @Test
     public void testEmbedded() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -94,7 +99,7 @@ public class Seven7ParserTest extends Ab
        assertEquals(9, tracker.filenames.size());
        assertEquals(9, tracker.mediatypes.size());
        
-       // Should have names but not content types, as tar doesn't
+       // Should have names but not content types, as 7z doesn't
        //  store the content types
        assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
        assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));