You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/03/13 07:47:36 UTC
svn commit: r1577038 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pkg/PackageParser.java
test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
Author: nick
Date: Thu Mar 13 06:47:35 2014
New Revision: 1577038
URL: http://svn.apache.org/r1577038
Log:
TIKA-1243 Add 7z support now that we have upgraded to Commons Compress 1.8, but it is a little nasty until COMPRESS-269 is resolved
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1577038&r1=1577037&r2=1577038&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Thu Mar 13 06:47:35 2014
@@ -25,10 +25,12 @@ import org.apache.commons.compress.archi
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.tika.exception.TikaException;
@@ -69,7 +71,7 @@ public class PackageParser extends Abstr
private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
private static final Set<MediaType> SUPPORTED_TYPES =
- MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR);
+ MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
static MediaType getMediaType(ArchiveInputStream stream) {
if (stream instanceof JarArchiveInputStream) {
@@ -84,6 +86,8 @@ public class PackageParser extends Abstr
return DUMP;
} else if (stream instanceof TarArchiveInputStream) {
return TAR;
+ } else if (stream instanceof SevenZWrapper) {
+ return SEVENZ;
} else {
return MediaType.OCTET_STREAM;
}
@@ -107,12 +111,26 @@ public class PackageParser extends Abstr
stream = new CloseShieldInputStream(stream);
// Ensure that the stream supports the mark feature
- stream = new BufferedInputStream(stream);
+ if (! TikaInputStream.isTikaInputStream(stream)) {
+ stream = new BufferedInputStream(stream);
+ }
ArchiveInputStream ais;
try {
ArchiveStreamFactory factory = new ArchiveStreamFactory();
ais = factory.createArchiveInputStream(stream);
+ } catch (StreamingNotSupportedException sne) {
+ // Most archive formats work on streams, but a few need files
+ if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
+ // Rework as a file, and wrap
+ stream.reset();
+ TikaInputStream tstream = TikaInputStream.get(stream);
+
+ // Pending a fix for COMPRESS-269, this bit is a little nasty
+ ais = new SevenZWrapper(new SevenZFile(tstream.getFile()));
+ } else {
+ throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
+ }
} catch (ArchiveException e) {
throw new TikaException("Unable to unpack document stream", e);
}
@@ -178,4 +196,29 @@ public class PackageParser extends Abstr
}
}
+ // Pending a fix for COMPRESS-269, we have to wrap ourselves
+ private static class SevenZWrapper extends ArchiveInputStream {
+ private SevenZFile file;
+ private SevenZWrapper(SevenZFile file) {
+ this.file = file;
+ }
+
+ @Override
+ public int read() throws IOException {
+ return file.read();
+ }
+ @Override
+ public int read(byte[] b) throws IOException {
+ return file.read(b);
+ }
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ return file.read(b, off, len);
+ }
+
+ @Override
+ public ArchiveEntry getNextEntry() throws IOException {
+ return file.getNextEntry();
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1577038&r1=1577037&r2=1577038&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Thu Mar 13 06:47:35 2014
@@ -23,24 +23,30 @@ import static org.junit.Assert.assertTru
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.junit.Ignore;
import org.xml.sax.ContentHandler;
/**
* Test case for parsing 7z files.
*/
public class Seven7ParserTest extends AbstractPkgTest {
- @Ignore // Pending a fix of COMPRESS-267, see TIKA-1243
+ private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+
@Test
public void test7ZParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
-
+
+ // Ensure 7zip is a parsable format
+ assertTrue("No 7zip parser found",
+ parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+
+ // Parse
InputStream stream = TarParserTest.class.getResourceAsStream(
"/test-documents/test-documents.7z");
try {
@@ -49,7 +55,7 @@ public class Seven7ParserTest extends Ab
stream.close();
}
- assertEquals("application/x-7z-compressed", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertTrue(content.contains("test-documents/testEXCEL.xls"));
assertTrue(content.contains("Sample Excel Worksheet"));
@@ -75,7 +81,6 @@ public class Seven7ParserTest extends Ab
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
*/
- @Ignore // Pending a fix of COMPRESS-267, see TIKA-1243
@Test
public void testEmbedded() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -94,7 +99,7 @@ public class Seven7ParserTest extends Ab
assertEquals(9, tracker.filenames.size());
assertEquals(9, tracker.mediatypes.size());
- // Should have names but not content types, as tar doesn't
+ // Should have names but not content types, as 7z doesn't
// store the content types
assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));