You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/08/17 17:49:46 UTC
[tika] branch branch_2x updated: Proof of concept for tika-parser-integration-tests module
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new a8677bc Proof of concept for tika-parser-integration-tests module
a8677bc is described below
commit a8677bc4afd699641b2a12085a1abaecc0aca200
Author: tallison <ta...@apache.org>
AuthorDate: Mon Aug 17 13:49:25 2020 -0400
Proof of concept for tika-parser-integration-tests module
---
pom.xml | 2 +-
.../apache/tika/sax/StoppingEarlyException.java | 24 +--
tika-parser-integration-tests/pom.xml | 63 +++++++
.../apache/tika/parser/tests/pkg/ArParserTest.java | 62 +++----
.../tika/parser/tests}/pkg/Bzip2ParserTest.java | 42 +----
.../pkg/CompositeZipContainerDetectorTest.java | 19 +-
.../tika/parser/tests}/pkg/CompressParserTest.java | 52 +-----
.../parser/tests/pkg/CompressorParserTest.java | 65 +++++++
.../tika/parser/tests}/pkg/GzipParserTest.java | 41 +----
.../tika/parser/tests/pkg/RarParserTest.java | 55 ++----
.../tika/parser/tests}/pkg/Seven7ParserTest.java | 62 +------
.../tika/parser/tests/pkg/TarParserTest.java | 57 ++----
.../tika/parser/tests}/pkg/ZipParserTest.java | 122 ++----------
.../tika/parser/tests}/pkg/ZlibParserTest.java | 37 +---
.../detect/microsoft/ooxml/OPCPackageDetector.java | 4 +-
...rg.apache.tika.detect.zip.ZipContainerDetector} | 0
tika-parser-modules/tika-parser-pkg-module/pom.xml | 8 -
.../org/apache/tika/parser/pkg/ArParserTest.java | 27 ---
.../apache/tika/parser/pkg/Bzip2ParserTest.java | 33 ----
.../apache/tika/parser/pkg/CompressParserTest.java | 39 ----
.../tika/parser/pkg/CompressorParserTest.java | 32 ----
.../org/apache/tika/parser/pkg/GzipParserTest.java | 52 +-----
.../org/apache/tika/parser/pkg/RarParserTest.java | 32 ----
.../apache/tika/parser/pkg/Seven7ParserTest.java | 141 --------------
.../org/apache/tika/parser/pkg/TarParserTest.java | 32 ----
.../org/apache/tika/parser/pkg/ZipParserTest.java | 94 ----------
.../org/apache/tika/parser/pkg/ZlibParserTest.java | 15 --
.../resources/test-documents/full_encrypted.7z | Bin
.../src/test/resources/test-documents/moby.zip | Bin
.../test-documents/test7Z_protected_passTika.7z | Bin
.../test-documents/testBROTLI_compressed.br | Bin
.../test/resources/test-documents/testEmbedded.zip | Bin
.../resources/test-documents/testJAR_with_HTML.jar | Bin
.../test-documents/testJAR_with_PEHDR.jar | Bin
.../resources/test-documents/testSnappy-framed.sz | Bin
.../src/test/resources/test-documents/testTXT.zlib | Bin
.../test/resources/test-documents/testTXT.zlib0 | Bin
.../test/resources/test-documents/testTXT.zlib5 | Bin
.../test/resources/test-documents/testTXT.zlib9 | Bin
.../tika-parser-text-module/pom.xml | 18 ++
...ector.java => DefaultZipContainerDetector.java} | 69 +++++--
...> DeprecatedStreamingZipContainerDetector.java} | 2 +-
.../org/apache/tika/detect/zip/IPADetector.java | 54 +++++-
.../org/apache/tika/detect/zip/JarDetector.java | 50 ++++-
.../org/apache/tika/detect/zip/KMZDetector.java | 48 ++++-
.../tika/detect/zip/OpenDocumentDetector.java | 22 ++-
.../apache/tika/detect/zip/StarOfficeDetector.java | 97 +++++++++-
.../tika/detect/zip/StreamingDetectContext.java | 62 +++++++
.../tika/detect/zip/ZipContainerDetector.java | 204 +++------------------
.../tika/detect/zip/ZipContainerDetectorBase.java | 69 +------
.../services/org.apache.tika.detect.Detector | 15 ++
...org.apache.tika.detect.zip.ZipContainerDetector | 19 ++
.../org.apache.tika.detect.zip.ZipDetector | 5 -
.../org/apache/tika/detect/zip/ZipParserTest.java} | 37 ++--
.../src/test/resources/test-documents/testJAR.jar | Bin
.../src/test/resources/test-documents/testKMZ.kmz | Bin
.../parser/fork/ForkParserIntegrationTest.java | 4 +-
57 files changed, 744 insertions(+), 1243 deletions(-)
diff --git a/pom.xml b/pom.xml
index e8e6cc7..eec4d3e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,7 @@
<module>tika-core</module>
<module>tika-parsers</module>
<module>tika-parser-modules</module>
+ <module>tika-parser-integration-tests</module>
<module>tika-bundle</module>
<module>tika-xmp</module>
<module>tika-serialization</module>
@@ -53,7 +54,6 @@
<module>tika-eval</module>
<module>tika-dl</module>
<module>tika-nlp</module>
- <module>tika-parser-zip-commons</module>
</modules>
<profiles>
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipDetector.java b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
similarity index 59%
rename from tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipDetector.java
rename to tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
index 07a6c9b..c79dd80 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
@@ -14,23 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.detect.zip;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
+package org.apache.tika.sax;
-import java.io.IOException;
+import org.xml.sax.SAXException;
-public interface ZipDetector {
+/**
+ * Sentinel exception used to stop SAX parsing of XML once the target
+ * has been found. It should be used only when the parse can be
+ * stopped early and the exception can be ignored.
+ */
+public class StoppingEarlyException extends SAXException {
- /**
- * If detection is successful, the ZipDetector should set the zip
- * file or OPCPackage in TikaInputStream.setOpenContainer()
- * @param zipFile
- * @param tis
- * @return
- * @throws IOException
- */
- MediaType detect(ZipFile zipFile, TikaInputStream tis) throws IOException;
+ public static final StoppingEarlyException INSTANCE = new StoppingEarlyException();
}
diff --git a/tika-parser-integration-tests/pom.xml b/tika-parser-integration-tests/pom.xml
new file mode 100644
index 0000000..d951902
--- /dev/null
+++ b/tika-parser-integration-tests/pom.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-parent</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>2.0.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-parser-integration-tests</artifactId>
+
+ <dependencies>
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pkg-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pkg-module</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ArParserTest.java
similarity index 52%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ArParserTest.java
index 5d2faaf..d977b64 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ArParserTest.java
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,60 +14,48 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-/**
- * Test case for parsing zlib compressed
- */
-public class ZlibParserTest extends AbstractPkgTest {
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
+
+public class ArParserTest extends AbstractPkgTest {
+
@Test
- public void testZlibParsing() throws Exception {
+ public void testArParsing() throws Exception {
+
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/testTXT.zlib")) {
+ try (InputStream stream = ArParserTest.class.getResourceAsStream(
+ "/test-documents/testARofText.ar")) {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
}
- assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
+ assertContains("testTXT.txt", content);
assertContains("Test d'indexation de Txt", content);
assertContains("http://www.apache.org", content);
- }
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/testTXT.zlib")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
+ try (InputStream stream = ArParserTest.class.getResourceAsStream(
+ "/test-documents/testARofSND.ar")) {
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
}
-
- // Should have found a single text document inside
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- // Won't have names, dates or types, as zlib doesn't have that
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
+
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
+ content = handler.toString();
+ assertContains("testAU.au", content);
}
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Bzip2ParserTest.java
similarity index 71%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Bzip2ParserTest.java
index b85b2e6..3b03eea 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Bzip2ParserTest.java
@@ -14,18 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
+import org.apache.tika.parser.pkg.ZipParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
/**
* Test case for parsing bzip2 files.
*/
@@ -63,32 +65,4 @@ public class Bzip2ParserTest extends AbstractPkgTest {
assertContains("Rida Benjelloun", content);
}
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompositeZipContainerDetectorTest.java
similarity index 92%
rename from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
rename to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompositeZipContainerDetectorTest.java
index 2bf49ab..ebdda9e 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompositeZipContainerDetectorTest.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
+package org.apache.tika.parser.tests.pkg;
import static org.junit.Assert.assertEquals;
@@ -34,8 +34,8 @@ import java.util.Set;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
-import org.apache.tika.detect.zip.StreamingZipContainerDetector;
-import org.apache.tika.detect.zip.ZipContainerDetector;
+import org.apache.tika.detect.zip.DeprecatedStreamingZipContainerDetector;
+import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -43,26 +43,27 @@ import org.apache.tika.mime.MediaTypeRegistry;
import org.junit.Ignore;
import org.junit.Test;
-public class ZipContainerDetectorTest extends TikaTest {
+public class CompositeZipContainerDetectorTest extends TikaTest {
private static MediaType ODT_TEXT = MediaType.application("vnd.oasis.opendocument.text");
private static MediaType TIFF = MediaType.image("tiff");
- ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
- StreamingZipContainerDetector streamingZipDetector = new StreamingZipContainerDetector();
-/*
+ DefaultZipContainerDetector compositeZipContainerDetector = new DefaultZipContainerDetector();
+ DeprecatedStreamingZipContainerDetector streamingZipDetector = new DeprecatedStreamingZipContainerDetector();
+
@Test
public void testTiffWorkaround() throws Exception {
//TIKA-2591
Metadata metadata = new Metadata();
try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
- MediaType mt = zipContainerDetector.detect(is, metadata);
+ MediaType mt = compositeZipContainerDetector.detect(is, metadata);
assertEquals(TIFF, mt);
}
metadata = new Metadata();
try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) {
- MediaType mt = zipContainerDetector.detect(is, metadata);
+ MediaType mt = compositeZipContainerDetector.detect(is, metadata);
assertEquals(TIFF, mt);
}
}
+/* TODO these tests!
@Test
public void testODT() throws Exception {
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressParserTest.java
similarity index 70%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressParserTest.java
index a62bbee..76050de 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressParserTest.java
@@ -14,20 +14,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
+import org.apache.tika.parser.pkg.TarParserTest;
+import org.apache.tika.parser.pkg.ZipParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
/**
* Test case for parsing compress (.Z) files.
*/
@@ -63,31 +66,6 @@ public class CompressParserTest extends AbstractPkgTest {
assertContains("Rida Benjelloun", content);
}
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream("/test-documents/test-documents.tar.Z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
@Test
public void testLZMAOOM() throws Exception {
@@ -98,14 +76,4 @@ public class CompressParserTest extends AbstractPkgTest {
}
}
- @Test
- public void testCompressOOM() throws Exception {
- try {
- XMLResult r = getXML("testZ_oom.Z");
- fail("should have thrown TikaMemoryLimitException");
- } catch (TikaMemoryLimitException e) {
- }
- }
-
-
}
\ No newline at end of file
diff --git a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressorParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressorParserTest.java
new file mode 100644
index 0000000..8618f2b
--- /dev/null
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/CompressorParserTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.tests.pkg;
+
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class CompressorParserTest extends TikaTest {
+
+
+ @Test
+ public void testLZ4Framed() throws Exception {
+ XMLResult r = getXML("testLZ4-framed.lz4");
+ assertEquals("application/x-lz4", r.metadata.get(Metadata.CONTENT_TYPE));
+ //xml parser throws an exception for test1.xml
+ //for now, be content that the container file is correctly identified
+ assertContains("test1.xml", r.xml);
+ }
+
+ @Test
+ public void testZstd() throws Exception {
+ XMLResult r = getXML("testZSTD.zstd");
+ assertContains("0123456789", r.xml);
+ }
+
+ @Test
+ public void testSnappyFramed() throws Exception {
+ XMLResult r = getXML("testSnappy-framed.sz");
+ assertEquals("application/x-snappy", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Lorem ipsum dolor sit amet", r.xml);
+ }
+
+ @Test
+ public void testBrotli() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
+ List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
+
+ assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ }
+}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/GzipParserTest.java
similarity index 75%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/GzipParserTest.java
index 91dc8c2..58ca3f9 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/GzipParserTest.java
@@ -14,18 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
+import org.apache.tika.parser.pkg.ZipParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
/**
* Test case for parsing gzip files.
*/
@@ -62,33 +64,6 @@ public class GzipParserTest extends AbstractPkgTest {
assertContains("test-documents/testXML.xml", content);
assertContains("Rida Benjelloun", content);
}
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tgz")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
@Test
public void testSvgzParsing() throws Exception {
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/RarParserTest.java
similarity index 60%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/RarParserTest.java
index b85b2e6..66facd0 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/RarParserTest.java
@@ -14,34 +14,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
+
/**
- * Test case for parsing bzip2 files.
+ * Test case for parsing rar files.
*/
-public class Bzip2ParserTest extends AbstractPkgTest {
+public class RarParserTest extends AbstractPkgTest {
@Test
- public void testBzip2Parsing() throws Exception {
+ public void testRarParsing() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
+ try (InputStream stream = RarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar")) {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
}
- assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-rar-compressed; version=4", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("Sample Excel Worksheet", content);
@@ -63,32 +64,4 @@ public class Bzip2ParserTest extends AbstractPkgTest {
assertContains("Rida Benjelloun", content);
}
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
-}
+}
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Seven7ParserTest.java
similarity index 80%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Seven7ParserTest.java
index 2fc6841..829f7ee 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/Seven7ParserTest.java
@@ -14,28 +14,26 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import javax.crypto.Cipher;
-
-import java.io.InputStream;
-import java.security.NoSuchAlgorithmException;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import javax.crypto.Cipher;
+import java.io.InputStream;
+import java.security.NoSuchAlgorithmException;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
/**
* Test case for parsing 7z files.
*/
@@ -74,46 +72,6 @@ public class Seven7ParserTest extends AbstractPkgTest {
assertContains("Rida Benjelloun", content);
}
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found all 9 documents, but not the directory
- assertEquals(9, tracker.filenames.size());
- assertEquals(9, tracker.mediatypes.size());
- assertEquals(9, tracker.modifiedAts.size());
-
- // Should have names but not content types, as 7z doesn't
- // store the content types
- assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
- assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
- assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
- assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
- assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
- assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
- assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
- assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
- assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
-
- for(String type : tracker.mediatypes) {
- assertNull(type);
- }
- for(String mod : tracker.modifiedAts) {
- assertNotNull(mod);
- assertTrue("Modified at " + mod, mod.startsWith("20"));
- }
- }
-
@Test
public void testPasswordProtected() throws Exception {
ContentHandler handler = new BodyContentHandler();
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/TarParserTest.java
similarity index 60%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/TarParserTest.java
index b85b2e6..c523f11 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/TarParserTest.java
@@ -14,34 +14,38 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
+import org.apache.tika.parser.pkg.ZipParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
/**
- * Test case for parsing bzip2 files.
+ * Test case for parsing tar files.
*/
-public class Bzip2ParserTest extends AbstractPkgTest {
+public class TarParserTest extends AbstractPkgTest {
@Test
- public void testBzip2Parsing() throws Exception {
+ public void testTarParsing() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
+ try (InputStream stream = TarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
}
- assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("Sample Excel Worksheet", content);
@@ -62,33 +66,4 @@ public class Bzip2ParserTest extends AbstractPkgTest {
assertContains("test-documents/testXML.xml", content);
assertContains("Rida Benjelloun", content);
}
-
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
-
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
- }
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZipParserTest.java
similarity index 59%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZipParserTest.java
index 25fcfb1..a3ca42c 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZipParserTest.java
@@ -14,17 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+package org.apache.tika.parser.tests.pkg;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
@@ -36,12 +26,23 @@ import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
/**
* Test case for parsing zip files.
*/
@@ -79,67 +80,9 @@ public class ZipParserTest extends AbstractPkgTest {
assertContains("Rida Benjelloun", content);
}
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found all 9 documents
- assertEquals(9, tracker.filenames.size());
- assertEquals(9, tracker.mediatypes.size());
- assertEquals(9, tracker.modifiedAts.size());
-
- // Should have names and modified dates, but not content types,
- // as zip doesn't store the content types
- assertEquals("testEXCEL.xls", tracker.filenames.get(0));
- assertEquals("testHTML.html", tracker.filenames.get(1));
- assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
- assertEquals("testPDF.pdf", tracker.filenames.get(3));
- assertEquals("testPPT.ppt", tracker.filenames.get(4));
- assertEquals("testRTF.rtf", tracker.filenames.get(5));
- assertEquals("testTXT.txt", tracker.filenames.get(6));
- assertEquals("testWORD.doc", tracker.filenames.get(7));
- assertEquals("testXML.xml", tracker.filenames.get(8));
-
- for(String type : tracker.mediatypes) {
- assertNull(type);
- }
- for(String crt : tracker.createdAts) {
- assertNull(crt);
- }
- for(String mod : tracker.modifiedAts) {
- assertNotNull(mod);
- assertTrue("Modified at " + mod, mod.startsWith("20"));
- }
- }
-
- /**
- * Test case for the ability of the ZIP parser to extract the name of
- * a ZIP entry even if the content of the entry is unreadable due to an
- * unsupported compression method.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
- */
- @Test
- public void testUnsupportedZipCompressionMethod() throws Exception {
- String content = new Tika().parseToString(
- ZipParserTest.class.getResourceAsStream(
- "/test-documents/moby.zip"));
- assertContains("README", content);
- }
-
private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
public Set<String> allRelIDs = new HashSet<String>();
- public boolean shouldParseEmbedded(Metadata metadata) {
+ public boolean shouldParseEmbedded(Metadata metadata) {
String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
if (relID != null) {
allRelIDs.add(relID);
@@ -175,27 +118,6 @@ public class ZipParserTest extends AbstractPkgTest {
assertTrue(relIDs.allRelIDs.contains("test2.txt"));
}
- @Test // TIKA-936
- public void testCustomEncoding() throws Exception {
- ArchiveStreamFactory factory = new ArchiveStreamFactory();
- factory.setEntryEncoding("SJIS");
- trackingContext.set(ArchiveStreamFactory.class, factory);
-
- try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
- "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
- + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
- + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
- + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
- AUTO_DETECT_PARSER.parse(
- stream, new DefaultHandler(),
- new Metadata(), trackingContext);
- }
-
- assertEquals(1, tracker.filenames.size());
- assertEquals(
- "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
- tracker.filenames.get(0));
- }
@Test
public void testZipEncrypted() throws Exception {
@@ -211,18 +133,6 @@ public class ZipParserTest extends AbstractPkgTest {
}
@Test
- public void testKMZDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
- assertEquals("application/vnd.google-earth.kmz", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-
- @Test
- public void testJARDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
- assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-
- @Test
public void testQuineRecursiveParserWrapper() throws Exception {
//received permission from author via dm
//2019-07-25 to include
@@ -231,10 +141,4 @@ public class ZipParserTest extends AbstractPkgTest {
//the original file name
getRecursiveMetadata("droste.zip");
}
-
- @Test(expected = TikaException.class)
- public void testQuine() throws Exception {
- getXML("droste.zip");
- }
-
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZlibParserTest.java
similarity index 62%
copy from tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
copy to tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZlibParserTest.java
index 5d2faaf..f7cd7a3 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
+++ b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/pkg/ZlibParserTest.java
@@ -14,17 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
+package org.apache.tika.parser.tests.pkg;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.AbstractPkgTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
/**
* Test case for parsing zlib compressed
*/
@@ -44,30 +45,4 @@ public class ZlibParserTest extends AbstractPkgTest {
assertContains("Test d'indexation de Txt", content);
assertContains("http://www.apache.org", content);
}
-
- /**
- * Tests that the ParseContext parser is correctly
- * fired for all the embedded entries.
- */
- @Test
- public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/testTXT.zlib")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
- }
-
- // Should have found a single text document inside
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
-
- // Won't have names, dates or types, as zlib doesn't have that
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.createdAts.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
- }
}
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
index b27155d..7b709bf 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
+++ b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
@@ -8,7 +8,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
-import org.apache.tika.detect.zip.ZipDetector;
+import org.apache.tika.detect.zip.ZipContainerDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
@@ -16,7 +16,7 @@ import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
-public class OPCPackageDetector implements ZipDetector {
+public class OPCPackageDetector implements ZipContainerDetector {
private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipDetector b/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipContainerDetector
similarity index 100%
rename from tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipDetector
rename to tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipContainerDetector
diff --git a/tika-parser-modules/tika-parser-pkg-module/pom.xml b/tika-parser-modules/tika-parser-pkg-module/pom.xml
index 43b3a57..40ada9b 100644
--- a/tika-parser-modules/tika-parser-pkg-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pkg-module/pom.xml
@@ -45,14 +45,6 @@
<artifactId>junrar</artifactId>
<version>4.0.0</version>
<exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging-api</artifactId>
- </exclusion>
<!-- TIKA-2504 exclude to avoid vulnerability in plexus-utils -->
<exclusion>
<groupId>org.apache.commons</groupId>
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
index c29fb60..579540c 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
@@ -29,34 +29,7 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
public class ArParserTest extends AbstractPkgTest {
- @Test
- public void testArParsing() throws Exception {
-
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ArParserTest.class.getResourceAsStream(
- "/test-documents/testARofText.ar")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-archive",
- metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("http://www.apache.org", content);
- try (InputStream stream = ArParserTest.class.getResourceAsStream(
- "/test-documents/testARofSND.ar")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-archive",
- metadata.get(Metadata.CONTENT_TYPE));
- content = handler.toString();
- assertContains("testAU.au", content);
- }
/**
* Tests that the ParseContext parser is correctly fired for all the
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
index b85b2e6..14da361 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
@@ -31,39 +31,6 @@ import org.xml.sax.ContentHandler;
*/
public class Bzip2ParserTest extends AbstractPkgTest {
- @Test
- public void testBzip2Parsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
-
/**
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
index a62bbee..04aba66 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
@@ -32,36 +32,6 @@ import org.xml.sax.ContentHandler;
* Test case for parsing compress (.Z) files.
*/
public class CompressParserTest extends AbstractPkgTest {
- @Test
- public void testCompressParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = TarParserTest.class.getResourceAsStream("/test-documents/test-documents.tar.Z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
/**
* Tests that the ParseContext parser is correctly
@@ -90,15 +60,6 @@ public class CompressParserTest extends AbstractPkgTest {
}
@Test
- public void testLZMAOOM() throws Exception {
- try {
- XMLResult r = getXML("testLZMA_oom");
- fail("should have thrown TikaMemoryLimitException");
- } catch (TikaMemoryLimitException e) {
- }
- }
-
- @Test
public void testCompressOOM() throws Exception {
try {
XMLResult r = getXML("testZ_oom.Z");
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 8826c50..5e221f0 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -59,38 +59,6 @@ public class CompressorParserTest extends TikaTest {
}
@Test
- public void testSnappyFramed() throws Exception {
- XMLResult r = getXML("testSnappy-framed.sz");
- assertEquals("application/x-snappy", r.metadata.get(Metadata.CONTENT_TYPE));
- assertContains("Lorem ipsum dolor sit amet", r.xml);
- }
-
- @Test
- public void testLZ4Framed() throws Exception {
- XMLResult r = getXML("testLZ4-framed.lz4");
- assertEquals("application/x-lz4", r.metadata.get(Metadata.CONTENT_TYPE));
- //xml parser throws an exception for test1.xml
- //for now, be content that the container file is correctly identified
- assertContains("test1.xml", r.xml);
- }
-
- @Test
- public void testZstd() throws Exception {
- XMLResult r = getXML("testZSTD.zstd");
- assertContains("0123456789", r.xml);
- }
-
- @Test
- public void testBrotli() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
- List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
-
- assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
- assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- }
-
- @Test
public void testCoverage() throws Exception {
//test that the package parser covers all inputstreams handled
//by CompressorStreamFactory. When we update commons-compress, and they add
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index 91dc8c2..c8ace4e 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -31,38 +31,6 @@ import org.xml.sax.ContentHandler;
*/
public class GzipParserTest extends AbstractPkgTest {
- @Test
- public void testGzipParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = GzipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tgz")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
/**
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
@@ -76,12 +44,12 @@ public class GzipParserTest extends AbstractPkgTest {
"/test-documents/test-documents.tgz")) {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
}
-
+
// Should find a single entry, for the (compressed) tar file
assertEquals(1, tracker.filenames.size());
assertEquals(1, tracker.mediatypes.size());
assertEquals(1, tracker.modifiedAts.size());
-
+
assertEquals(null, tracker.filenames.get(0));
assertEquals(null, tracker.mediatypes.get(0));
assertEquals(null, tracker.modifiedAts.get(0));
@@ -89,20 +57,4 @@ public class GzipParserTest extends AbstractPkgTest {
// Tar file starts with the directory name
assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
}
-
- @Test
- public void testSvgzParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = GzipParserTest.class.getResourceAsStream(
- "/test-documents/testSVG.svgz")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("Test SVG image", content);
- }
-
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
index d6f5af1..340679c 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
@@ -38,38 +38,6 @@ import org.xml.sax.ContentHandler;
*/
public class RarParserTest extends AbstractPkgTest {
- @Test
- public void testRarParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = RarParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.rar")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-rar-compressed; version=4", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
/**
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
index 2fc6841..e893092 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
@@ -40,39 +40,6 @@ import org.xml.sax.ContentHandler;
* Test case for parsing 7z files.
*/
public class Seven7ParserTest extends AbstractPkgTest {
- private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
-
- @Test
- public void test7ZParsing() throws Exception {
- Metadata metadata = new Metadata();
-
- // Ensure 7zip is a parsable format
- assertTrue("No 7zip parser found",
- AUTO_DETECT_PARSER.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
-
- // Parse
- String content = getText("test-documents.7z", metadata);
-
- assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
/**
* Tests that the ParseContext parser is correctly
@@ -113,112 +80,4 @@ public class Seven7ParserTest extends AbstractPkgTest {
assertTrue("Modified at " + mod, mod.startsWith("20"));
}
}
-
- @Test
- public void testPasswordProtected() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- // No password, will fail with EncryptedDocumentException
- boolean ex = false;
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/test7Z_protected_passTika.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- fail("Shouldn't be able to read a password protected 7z without the password");
- } catch (EncryptedDocumentException e) {
- // Good
- ex = true;
- }
-
- assertTrue("test no password", ex);
-
- // No password, will fail with EncryptedDocumentException
- ex = false;
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/full_encrypted.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- fail("Shouldn't be able to read a full password protected 7z without the password");
- } catch (EncryptedDocumentException e) {
- // Good
- ex = true;
- } catch (Exception e){
- ex = false;
- }
-
- assertTrue("test no password for full encrypted 7z", ex);
-
- ex = false;
-
- // Wrong password currently silently gives no content
- // Ideally we'd like Commons Compress to give an error, but it doesn't...
- recursingContext.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "wrong";
- }
- });
- handler = new BodyContentHandler();
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/test7Z_protected_passTika.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- fail("Shouldn't be able to read a password protected 7z with wrong password");
- } catch (TikaException e) {
- //if JCE is installed, the cause will be: Caused by: org.tukaani.xz.CorruptedInputException: Compressed data is corrupt
- //if JCE is not installed, the message will include
- // "(do you have the JCE Unlimited Strength Jurisdiction Policy Files installed?")
- ex = true;
- }
- assertTrue("TikaException for bad password", ex);
- // Will be empty
- assertEquals("", handler.toString());
-
- ex = false;
- // Right password works fine if JCE Unlimited Strength has been installed!!!
- if (isStrongCryptoAvailable()) {
- recursingContext.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "Tika";
- }
- });
- handler = new BodyContentHandler();
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/test7Z_protected_passTika.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
-
- // Should get filename
- assertContains("text.txt", content);
-
- // Should get contents from the text file in the 7z file
- assertContains("TEST DATA FOR TIKA.", content);
- assertContains("This is text inside an encrypted 7zip (7z) file.", content);
- assertContains("It should be processed by Tika just fine!", content);
- assertContains("TIKA-1521", content);
- } else {
- //if jce is not installed, test for IOException wrapped in TikaException
- boolean ioe = false;
- recursingContext.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "Tika";
- }
- });
- handler = new BodyContentHandler();
- try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
- "/test-documents/test7Z_protected_passTika.7z")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- } catch (TikaException e) {
- ioe = true;
- }
- assertTrue("IOException because JCE was not installed", ioe);
- }
- }
-
- private static boolean isStrongCryptoAvailable() throws NoSuchAlgorithmException {
- return Cipher.getMaxAllowedKeyLength("AES/ECB/PKCS5Padding") >= 256;
- }
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index abdd3f4..286538d 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -33,38 +33,6 @@ import org.xml.sax.ContentHandler;
*/
public class TarParserTest extends AbstractPkgTest {
- @Test
- public void testTarParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = TarParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.tar")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
/**
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 25fcfb1..32ac389 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -47,38 +47,6 @@ import org.xml.sax.helpers.DefaultHandler;
*/
public class ZipParserTest extends AbstractPkgTest {
- @Test
- public void testZipParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
/**
* Tests that the ParseContext parser is correctly
* fired for all the embedded entries.
@@ -137,43 +105,6 @@ public class ZipParserTest extends AbstractPkgTest {
assertContains("README", content);
}
- private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
- public Set<String> allRelIDs = new HashSet<String>();
- public boolean shouldParseEmbedded(Metadata metadata) {
- String relID = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
- if (relID != null) {
- allRelIDs.add(relID);
- }
- return false;
- }
-
- public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
- throw new UnsupportedOperationException("should never be called");
- }
- }
-
- // TIKA-1036
- @Test
- public void testPlaceholders() throws Exception {
- String xml = getXML("testEmbedded.zip").xml;
- assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
- assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
-
- // Also make sure EMBEDDED_RELATIONSHIP_ID was
- // passed when parsing the embedded docs:
- ParseContext context = new ParseContext();
- GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
- context.set(EmbeddedDocumentExtractor.class, relIDs);
- try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
- AUTO_DETECT_PARSER.parse(input,
- new BodyContentHandler(),
- new Metadata(),
- context);
- }
-
- assertTrue(relIDs.allRelIDs.contains("test1.txt"));
- assertTrue(relIDs.allRelIDs.contains("test2.txt"));
- }
@Test // TIKA-936
public void testCustomEncoding() throws Exception {
@@ -198,31 +129,6 @@ public class ZipParserTest extends AbstractPkgTest {
}
@Test
- public void testZipEncrypted() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");
- assertEquals(2, metadataList.size());
- String[] values = metadataList.get(0).getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM);
- assertNotNull(values);
- assertEquals(1, values.length);
- assertContains("EncryptedDocumentException: stream (encrypted.txt) is encrypted", values[0]);
-
-
- assertContains("hello world", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
- }
-
- @Test
- public void testKMZDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
- assertEquals("application/vnd.google-earth.kmz", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-
- @Test
- public void testJARDetection() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
- assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
- }
-
- @Test
public void testQuineRecursiveParserWrapper() throws Exception {
//received permission from author via dm
//2019-07-25 to include
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
index 5d2faaf..30e3868 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
@@ -29,21 +29,6 @@ import org.xml.sax.ContentHandler;
* Test case for parsing zlib compressed
*/
public class ZlibParserTest extends AbstractPkgTest {
- @Test
- public void testZlibParsing() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = ZipParserTest.class.getResourceAsStream(
- "/test-documents/testTXT.zlib")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("Test d'indexation de Txt", content);
- assertContains("http://www.apache.org", content);
- }
/**
* Tests that the ParseContext parser is correctly
diff --git a/tika-parsers/src/test/resources/test-documents/full_encrypted.7z b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/full_encrypted.7z
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/full_encrypted.7z
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/full_encrypted.7z
diff --git a/tika-parsers/src/test/resources/test-documents/moby.zip b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/moby.zip
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/moby.zip
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/moby.zip
diff --git a/tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/test7Z_protected_passTika.7z
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/test7Z_protected_passTika.7z
diff --git a/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testBROTLI_compressed.br
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testBROTLI_compressed.br
diff --git a/tika-parsers/src/test/resources/test-documents/testEmbedded.zip b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testEmbedded.zip
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testEmbedded.zip
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testEmbedded.zip
diff --git a/tika-parsers/src/test/resources/test-documents/testJAR_with_HTML.jar b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testJAR_with_HTML.jar
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testJAR_with_HTML.jar
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testJAR_with_HTML.jar
diff --git a/tika-parsers/src/test/resources/test-documents/testJAR_with_PEHDR.jar b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testJAR_with_PEHDR.jar
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testJAR_with_PEHDR.jar
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testJAR_with_PEHDR.jar
diff --git a/tika-parsers/src/test/resources/test-documents/testSnappy-framed.sz b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSnappy-framed.sz
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testSnappy-framed.sz
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSnappy-framed.sz
diff --git a/tika-parsers/src/test/resources/test-documents/testTXT.zlib b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testTXT.zlib
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib
diff --git a/tika-parsers/src/test/resources/test-documents/testTXT.zlib0 b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib0
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testTXT.zlib0
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib0
diff --git a/tika-parsers/src/test/resources/test-documents/testTXT.zlib5 b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib5
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testTXT.zlib5
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib5
diff --git a/tika-parsers/src/test/resources/test-documents/testTXT.zlib9 b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib9
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testTXT.zlib9
rename to tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testTXT.zlib9
diff --git a/tika-parser-modules/tika-parser-text-module/pom.xml b/tika-parser-modules/tika-parser-text-module/pom.xml
index c65dbd8..c92d22e 100644
--- a/tika-parser-modules/tika-parser-text-module/pom.xml
+++ b/tika-parser-modules/tika-parser-text-module/pom.xml
@@ -58,6 +58,24 @@
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.txt</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
</plugins>
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
similarity index 74%
copy from tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java
copy to tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 9b58de2..cadf38d 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -18,13 +18,15 @@ package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.tika.config.Field;
import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
@@ -36,10 +38,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.List;
-public class ZipContainerDetector implements Detector {
-
-
-
+public class DefaultZipContainerDetector implements Detector {
//Regrettably, some tiff files can be incorrectly identified
//as tar files. We need this ugly workaround to rule out TIFF.
@@ -61,25 +60,25 @@ public class ZipContainerDetector implements Detector {
@Field
int markLimit = 16 * 1024 * 1024;
- List<ZipDetector> zipDetectors;
+ List<ZipContainerDetector> zipDetectors;
- public ZipContainerDetector() {
- this(new ServiceLoader(DefaultEncodingDetector.class.getClassLoader()));
+ public DefaultZipContainerDetector() {
+ this(new ServiceLoader(DefaultZipContainerDetector.class.getClassLoader()));
}
- public ZipContainerDetector(ServiceLoader loader) {
- this(loader.loadServiceProviders(ZipDetector.class));
+ public DefaultZipContainerDetector(ServiceLoader loader) {
+ this(loader.loadServiceProviders(ZipContainerDetector.class));
}
- public ZipContainerDetector(List<ZipDetector> zipDetectors) {
- //OPCBased needs to be last!!!
+ public DefaultZipContainerDetector(List<ZipContainerDetector> zipDetectors) {
+ //TODO: OPCBased needs to be last!!!
this.zipDetectors = zipDetectors;
}
/**
* If this is less than 0, the file will be spooled to disk,
* and detection will run on the full file.
- * If this is greater than 0, the {@link StreamingZipContainerDetector}
+ * If this is greater than 0, the streaming detection routine
* will be called only up to the markLimit.
*
* @param markLimit mark limit for streaming detection
@@ -121,15 +120,13 @@ public class ZipContainerDetector implements Detector {
}
try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
- //TODO: figure out this one
- //return streamingZipContainerDetector.detect(lookahead, metadata);
+ return detectStreaming(lookahead, metadata);
}
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
} else {
return detectCompressorFormat(prefix, length);
}
- return PackageConstants.ZIP;
}
/**
@@ -144,7 +141,7 @@ public class ZipContainerDetector implements Detector {
ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
try{
- for (ZipDetector zipDetector : zipDetectors) {
+ for (ZipContainerDetector zipDetector : zipDetectors) {
MediaType type = zipDetector.detect(zip, tis);
if (type != null) {
return type;
@@ -208,4 +205,42 @@ public class ZipContainerDetector implements Detector {
return MediaType.OCTET_STREAM;
}
}
+
+    /**
+     * Detects the container type by streaming through the zip entries of
+     * {@code input} and offering each entry to the registered
+     * {@link ZipContainerDetector}s.
+     *
+     * @param input    stream positioned at the start of the zip data; it is wrapped
+     *                 in a {@link CloseShieldInputStream} before being handed to
+     *                 the zip reader
+     * @param metadata currently not consulted by the streaming path
+     * @return the first type reported while iterating the entries, otherwise the
+     *         result of {@link #finalDetect(StreamingDetectContext)}
+     * @throws IOException kept in the signature for compatibility; read failures
+     *                     while iterating entries are handled internally
+     */
+    private MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
+        StreamingDetectContext detectContext = new StreamingDetectContext();
+        try (
+                ZipArchiveInputStream zis =
+                        new ZipArchiveInputStream(new CloseShieldInputStream(input))) {
+            ZipArchiveEntry zae = zis.getNextZipEntry();
+            while (zae != null) {
+                MediaType mt = detect(zae, zis, detectContext);
+                if (mt != null) {
+                    return mt;
+                }
+                zae = zis.getNextZipEntry();
+            }
+        } catch (IOException e) {
+            // The caller hands in a LookaheadInputStream capped at markLimit, so the
+            // archive may be cut off in the middle of an entry. Treat a read failure
+            // as "no more entries" and decide from what has been seen so far instead
+            // of failing detection outright.
+        }
+        return finalDetect(detectContext);
+    }
+
+
+    /**
+     * Offers one zip entry to every registered detector, in list order, and
+     * stops at the first detector that reports a type.
+     *
+     * @param zae           the entry currently being visited
+     * @param zis           the zip stream positioned at that entry
+     * @param detectContext per-stream scratch state shared between detectors
+     * @return the first non-null type reported, or {@code null} if no detector
+     *         could decide from this entry
+     */
+    private MediaType detect(ZipArchiveEntry zae, ZipArchiveInputStream zis, StreamingDetectContext detectContext) {
+        MediaType detected = null;
+        for (ZipContainerDetector candidate : zipDetectors) {
+            detected = candidate.streamingDetectUpdate(zae, zis, detectContext);
+            if (detected != null) {
+                break;
+            }
+        }
+        return detected;
+    }
+
+    /**
+     * Asks every registered detector for a final verdict once all entries have
+     * been visited.
+     *
+     * @param detectContext per-stream scratch state filled during iteration
+     * @return the first non-null type reported by a detector, or
+     *         {@link MediaType#APPLICATION_ZIP} when none of them claims the file
+     */
+    private MediaType finalDetect(StreamingDetectContext detectContext) {
+        MediaType verdict = null;
+        for (ZipContainerDetector candidate : zipDetectors) {
+            verdict = candidate.streamingDetectFinal(detectContext);
+            if (verdict != null) {
+                break;
+            }
+        }
+        return verdict != null ? verdict : MediaType.APPLICATION_ZIP;
+    }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DeprecatedStreamingZipContainerDetector.java
similarity index 99%
rename from tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
rename to tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DeprecatedStreamingZipContainerDetector.java
index f0e9493..63b4191 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DeprecatedStreamingZipContainerDetector.java
@@ -40,7 +40,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-public class StreamingZipContainerDetector extends ZipContainerDetectorBase implements Detector {
+public class DeprecatedStreamingZipContainerDetector extends ZipContainerDetectorBase implements Detector {
private static final int MAX_MIME_TYPE = 1024;
private static final int MAX_MANIFEST = 20 * 1024 * 1024;
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/IPADetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/IPADetector.java
index 3702ffe..78585a9 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/IPADetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/IPADetector.java
@@ -17,18 +17,22 @@
package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import java.io.IOException;
+import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;
-public class IPADetector implements ZipDetector {
+public class IPADetector implements ZipContainerDetector {
+
+ static final MediaType IPA = MediaType.application("x-itunes-ipa");
/**
* To be considered as an IPA file, it needs to match all of these
@@ -47,20 +51,20 @@ public class IPADetector implements ZipDetector {
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
// Note - consider generalising this logic, if another format needs many regexp matching
- Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone();
+ TmpPatterns tmpPatterns = new TmpPatterns();
Enumeration<ZipArchiveEntry> entries = zip.getEntries();
while (entries.hasMoreElements()) {
ZipArchiveEntry entry = entries.nextElement();
String name = entry.getName();
- Iterator<Pattern> ip = tmpPatterns.iterator();
+ Iterator<Pattern> ip = tmpPatterns.patterns.iterator();
while (ip.hasNext()) {
if (ip.next().matcher(name).matches()) {
ip.remove();
}
}
- if (tmpPatterns.isEmpty()) {
+ if (tmpPatterns.patterns.isEmpty()) {
// We've found everything we need to find
return MediaType.application("x-itunes-ipa");
}
@@ -70,4 +74,46 @@ public class IPADetector implements ZipDetector {
return null;
}
+
+    /**
+     * Streaming variant of the IPA check: strikes every still-pending IPA entry
+     * pattern that matches the current entry name and reports IPA as soon as no
+     * pattern is left.
+     *
+     * @param zae           the entry currently being visited
+     * @param zis           the entry's stream (not read here)
+     * @param detectContext holds the set of patterns not yet matched for this stream
+     * @return {@code IPA} once every required pattern has matched, otherwise {@code null}
+     */
+    @Override
+    public MediaType streamingDetectUpdate(ZipArchiveEntry zae,
+                                           InputStream zis,
+                                           StreamingDetectContext detectContext) {
+        String entryName = zae.getName();
+
+        // Lazily create the per-stream working copy of the required patterns.
+        TmpPatterns pending = detectContext.get(TmpPatterns.class);
+        if (pending == null) {
+            pending = new TmpPatterns();
+            detectContext.set(TmpPatterns.class, pending);
+        }
+
+        for (Iterator<Pattern> it = pending.patterns.iterator(); it.hasNext(); ) {
+            Pattern required = it.next();
+            if (required.matcher(entryName).matches()) {
+                it.remove();
+            }
+        }
+
+        // An empty set means we've found everything we need to find.
+        return pending.patterns.isEmpty() ? IPA : null;
+    }
+
+    /**
+     * Final IPA verdict after all entries were visited.
+     *
+     * @param detectContext holds the set of patterns not yet matched for this stream
+     * @return {@code IPA} if every required pattern matched; otherwise {@code null},
+     *         after dropping this detector's scratch state from the context
+     */
+    @Override
+    public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
+        TmpPatterns pending = detectContext.get(TmpPatterns.class);
+        if (pending != null) {
+            if (pending.patterns.isEmpty()) {
+                // We've found everything we need to find
+                return IPA;
+            }
+            // Not an IPA: the working copy is of no further use.
+            detectContext.remove(TmpPatterns.class);
+        }
+        return null;
+    }
+
+    /**
+     * Mutable working copy of the required IPA entry patterns. Patterns are
+     * removed from {@link #patterns} as matching entries are seen, so each
+     * detection run needs its own instance.
+     */
+    private static class TmpPatterns {
+        // Copy constructor instead of clone(): same contents, no unchecked cast.
+        final Set<Pattern> patterns = new HashSet<Pattern>(ipaEntryPatterns);
+    }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/JarDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/JarDetector.java
index cf6445d..60cc95c 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/JarDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/JarDetector.java
@@ -16,13 +16,19 @@
*/
package org.apache.tika.detect.zip;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import java.io.IOException;
+import java.io.InputStream;
+
+public class JarDetector implements ZipContainerDetector {
+
+    // Shared, stateless marker instance; final so the singleton can never be reassigned.
+    private static final SeenManifest SEEN_MANIFEST = new SeenManifest();
-public class JarDetector implements ZipDetector {
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
if (zip.getEntry("META-INF/MANIFEST.MF") != null) {
@@ -52,4 +58,46 @@ public class JarDetector implements ZipDetector {
return null;
}
}
+
+    /**
+     * Streaming variant of the jar-family check, evaluated one entry at a time.
+     * <ul>
+     *   <li>{@code AndroidManifest.xml} immediately identifies an Android APK.</li>
+     *   <li>{@code META-INF/MANIFEST.MF} records in the context that the archive
+     *       is a jar or something based on a jar.</li>
+     *   <li>Once the manifest has been recorded, {@code WEB-INF/} identifies a web
+     *       archive and {@code META-INF/application.xml} an enterprise archive.</li>
+     * </ul>
+     * NOTE(review): the WAR/EAR markers are only honoured for entries visited
+     * after the manifest entry; confirm that this ordering assumption holds for
+     * the archives this detector must support.
+     *
+     * @param zae           the entry currently being visited
+     * @param zis           the entry's stream (not read here)
+     * @param detectContext receives the "manifest seen" marker
+     * @return a specific type if this entry decides it, otherwise {@code null}
+     */
+    @Override
+    public MediaType streamingDetectUpdate(ZipArchiveEntry zae,
+                                           InputStream zis,
+                                           StreamingDetectContext detectContext) {
+
+        String name = zae.getName();
+        if (name.equals("AndroidManifest.xml")) {
+            // Is it an Android APK? Decided regardless of whether a manifest was seen.
+            return MediaType.application("vnd.android.package-archive");
+        }
+        if (name.equals("META-INF/MANIFEST.MF")) {
+            // It's a Jar file, or something based on Jar
+            detectContext.set(SeenManifest.class, SEEN_MANIFEST);
+        }
+
+        if (detectContext.get(SeenManifest.class) == null) {
+            return null;
+        }
+
+        // Check for WAR and EAR
+        if (name.equals("WEB-INF/")) {
+            return MediaType.application("x-tika-java-web-archive");
+        }
+        if (name.equals("META-INF/application.xml")) {
+            return MediaType.application("x-tika-java-enterprise-archive");
+        }
+        return null;
+    }
+
+    /**
+     * Final jar verdict after all entries were visited.
+     *
+     * @param detectContext context that may carry the "manifest seen" marker
+     * @return {@code application/java-archive} if a manifest entry was recorded,
+     *         otherwise {@code null}
+     */
+    @Override
+    public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
+        boolean manifestSeen = detectContext.get(SeenManifest.class) != null;
+        // A manifest with no more specific marker looks like a regular Jar Archive.
+        return manifestSeen ? MediaType.application("java-archive") : null;
+    }
+
+ // Marker type stored in the StreamingDetectContext once a META-INF/MANIFEST.MF
+ // entry has been seen; it carries no state, only its presence is checked.
+ private static class SeenManifest { }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
index f848d6a..e2cf83a 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/KMZDetector.java
@@ -17,14 +17,18 @@
package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import java.io.IOException;
+import java.io.InputStream;
import java.util.Enumeration;
-public class KMZDetector implements ZipDetector {
+import static org.apache.tika.detect.zip.PackageConstants.KMZ;
+
+public class KMZDetector implements ZipContainerDetector {
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
boolean kmlFound = false;
@@ -49,4 +53,46 @@ public class KMZDetector implements ZipDetector {
return null;
}
}
+
+ /** Counts top-level ".kml" entries; the verdict is deferred to the final step. */
+ @Override
+ public MediaType streamingDetectUpdate(ZipArchiveEntry zae,
+ InputStream zis, StreamingDetectContext detectContext) {
+ String entryName = zae.getName();
+ // only entries without any path separator are considered
+ boolean nested = entryName.contains("/") || entryName.contains("\\");
+ if (nested || !entryName.endsWith(".kml")) {
+ return null;
+ }
+ KMLCounter kmlCounter = detectContext.get(KMLCounter.class);
+ if (kmlCounter == null) {
+ // first .kml seen: start a counter and remember it in the context
+ kmlCounter = new KMLCounter();
+ detectContext.set(KMLCounter.class, kmlCounter);
+ }
+ kmlCounter.increment();
+ return null;
+ }
+
+ /** Reports KMZ only when exactly one matching ".kml" entry was counted. */
+ @Override
+ public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
+ KMLCounter kmlCounter = detectContext.get(KMLCounter.class);
+ if (kmlCounter == null || kmlCounter.getCount() != 1) {
+ return null;
+ }
+ return KMZ;
+ }
+
+ /** Mutable tally of ".kml" entries; stored in the detect context. */
+ private static class KMLCounter {
+ private int kmlEntries;
+
+ void increment() {
+ kmlEntries += 1;
+ }
+ int getCount() {
+ return kmlEntries;
+ }
+ }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/OpenDocumentDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/OpenDocumentDetector.java
index e0b21d9..c7b61a0 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/OpenDocumentDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/OpenDocumentDetector.java
@@ -17,6 +17,7 @@
package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
@@ -24,10 +25,11 @@ import org.apache.tika.mime.MediaType;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import static java.nio.charset.StandardCharsets.UTF_8;
-public class OpenDocumentDetector implements ZipDetector {
+public class OpenDocumentDetector implements ZipContainerDetector {
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
try {
@@ -43,4 +45,22 @@ public class OpenDocumentDetector implements ZipDetector {
return null;
}
}
+
+ /** Reads the "mimetype" entry, if this is it, and parses its content as the type. */
+ @Override
+ public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis, StreamingDetectContext detectContext) {
+ if (!"mimetype".equals(zae.getName())) {
+ return null;
+ }
+ try {
+ return MediaType.parse(IOUtils.toString(zis, UTF_8));
+ } catch (IOException ignored) {
+ return null; // unreadable entry: no verdict from this detector
+ }
+ }
+
+ @Override
+ public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
+ return null; // no deferred decision: in streaming mode this detector only answers from the "mimetype" entry
+ }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java
index bc484fe..bae797c 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java
@@ -17,24 +17,115 @@
package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.StoppingEarlyException;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+public class StarOfficeDetector implements ZipContainerDetector {
+
+ static final Map<String, MediaType> STAR_OFFICE_X = new HashMap<>(); // manifest media-type value -> detected type
+
+ static {
+ STAR_OFFICE_X.put("application/vnd.sun.xml.writer",
+ MediaType.application("vnd.sun.xml.writer"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.calc",
+ MediaType.application("vnd.sun.xml.calc"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.draw",
+ MediaType.application("vnd.sun.xml.draw"));
+ STAR_OFFICE_X.put("application/vnd.sun.xml.impress",
+ MediaType.application("vnd.sun.xml.impress"));
+ STAR_OFFICE_X.put("application/vnd.sun.star.configuration-data",
+ MediaType.application("vnd.openofficeorg.extension"));
+ }
+
+ static final MediaType BAU = // reported when the "/" file-entry has an empty media-type
+ MediaType.application("vnd.openofficeorg.autotext");
+
-public class StarOfficeDetector implements ZipDetector {
@Override
public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
+ // Looks up the manifest entry; no verdict (null) when the zip has none.
ZipArchiveEntry zae = zip.getEntry("META-INF/manifest.xml");
if (zae == null) {
return null;
}
+ // close the entry stream once the manifest has been parsed
+ try (InputStream is = zip.getInputStream(zae)) {
+ return detectStarOfficeX(is);
+ }
+ }
+
+ @Override
+ public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis,
+ StreamingDetectContext detectContext) {
+ String name = zae.getName();
+ if (! "META-INF/manifest.xml".equals(name)) {
+ return null; // only the manifest entry is inspected
+ }
- try (InputStream is = zip.getInputStream(zae)) {
- return ZipContainerDetectorBase.detectStarOfficeX(is);
+ return detectStarOfficeX(zis); // null when the manifest names no known type
+
+ }
+
+ @Override
+ public MediaType streamingDetectFinal(StreamingDetectContext detectContext) {
+ return null; // no deferred decision: in streaming mode the answer comes only from the manifest entry
+ }
+
+ //parse the META-INF/manifest.xml file; returns null when no known type is found
+ static MediaType detectStarOfficeX(InputStream is) {
+ StarOfficeXHandler handler = new StarOfficeXHandler();
+ try {
+ XMLReaderUtils.parseSAX(is,
+ new OfflineContentHandler(handler),
+ new ParseContext());
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) { // deliberately ignored: whatever type the handler recorded before the parse ended is still returned
}
+ return handler.mediaType;
}
+
+ private static class StarOfficeXHandler extends DefaultHandler { // stops the parse at the first decisive file-entry
+ private MediaType mediaType = null; // read by detectStarOfficeX after parsing
+
+ @Override
+ public void startElement(String uri, String localName,
+ String name, Attributes attrs) throws SAXException {
+ if (! "file-entry".equals(localName)) {
+ return;
+ }
+ String declaredType = null;
+ String entryPath = null;
+ for (int idx = 0; idx < attrs.getLength(); idx++) {
+ String attrName = attrs.getLocalName(idx);
+ if (attrName.equals("full-path")) {
+ entryPath = attrs.getValue(idx);
+ } else if (attrName.equals("media-type")) {
+ declaredType = attrs.getValue(idx);
+ MediaType known = STAR_OFFICE_X.get(declaredType);
+ if (known != null) {
+ mediaType = known;
+ throw StoppingEarlyException.INSTANCE;
+ }
+ }
+ }
+ if ("".equals(declaredType) && "/".equals(entryPath)) {
+ mediaType = BAU;
+ throw StoppingEarlyException.INSTANCE;
+ }
+ }
+ }
+
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingDetectContext.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingDetectContext.java
new file mode 100644
index 0000000..dc6e3f1
--- /dev/null
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingDetectContext.java
@@ -0,0 +1,62 @@
+package org.apache.tika.detect.zip;
+
+import java.util.HashMap;
+import java.util.Map;
+
+class StreamingDetectContext {
+
+ /** State objects stored during streaming detection, keyed by class name. */
+ private final Map<String, Object> context = new HashMap<>();
+
+ /**
+ * Stores the given value under the given class, replacing any earlier value.
+ *
+ * @param key the class under whose name the value is stored
+ * @param value the value to store, or <code>null</code> to remove the entry
+ */
+ public <T> void set(Class<T> key, T value) {
+ if (value != null) {
+ context.put(key.getName(), value);
+ } else {
+ context.remove(key.getName());
+ }
+ }
+
+ /**
+ * Returns the object stored under the given class.
+ *
+ * @param key the class under whose name the object was stored
+ * @return the stored object, or <code>null</code> if not found
+ */
+ @SuppressWarnings("unchecked")
+ public <T> T get(Class<T> key) {
+ return (T) context.get(key.getName());
+ }
+
+ /**
+ * Returns the object stored under the given class,
+ * or the given default value if no such object is stored.
+ *
+ * @param key the class under whose name the object was stored
+ * @param defaultValue value to return if the requested object is not found
+ * @return the stored object, or the given default value if not found
+ */
+ public <T> T get(Class<T> key, T defaultValue) {
+ T value = get(key);
+ if (value != null) {
+ return value;
+ } else {
+ return defaultValue;
+ }
+ }
+
+ /**
+ * Removes the object stored under the given class, if any. Entries are
+ * keyed by class name, so the name (not the Class object) is the lookup key.
+ *
+ * @param key the class whose entry should be removed
+ */
+ public void remove(Class<?> key) {
+ context.remove(key.getName());
+ }
+}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java
index 9b58de2..2db5e94 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetector.java
@@ -16,196 +16,50 @@
*/
package org.apache.tika.detect.zip;
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.tika.config.Field;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.DefaultEncodingDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.List;
-public class ZipContainerDetector implements Detector {
-
-
-
-
- //Regrettably, some tiff files can be incorrectly identified
- //as tar files. We need this ugly workaround to rule out TIFF.
- //If commons-compress ever chooses to take over TIFF detection
- //we can remove all of this. See TIKA-2591.
- private final static MediaType TIFF = MediaType.image("tiff");
- private final static byte[][] TIFF_SIGNATURES = new byte[3][];
- static {
- TIFF_SIGNATURES[0] = new byte[]{'M','M',0x00,0x2a};
- TIFF_SIGNATURES[1] = new byte[]{'I','I',0x2a, 0x00};
- TIFF_SIGNATURES[2] = new byte[]{'M','M', 0x00, 0x2b};
- }
-
- /** Serial version UID */
- private static final long serialVersionUID = 2891763938430295453L;
-
- //this has to be > 100,000 to handle some of the iworks files
- //in our unit tests
- @Field
- int markLimit = 16 * 1024 * 1024;
-
- List<ZipDetector> zipDetectors;
-
- public ZipContainerDetector() {
- this(new ServiceLoader(DefaultEncodingDetector.class.getClassLoader()));
- }
-
- public ZipContainerDetector(ServiceLoader loader) {
- this(loader.loadServiceProviders(ZipDetector.class));
- }
+/**
+ * Classes that implement this must be able to detect on a ZipFile and in streaming mode.
+ * In streaming mode, {@link #streamingDetectUpdate} is called for each
+ * ZipArchiveEntry and then {@link #streamingDetectFinal(StreamingDetectContext)}
+ * is called for a final decision.
+ *
+ * During streaming detection, state is stored in the StreamingDetectContext
+ */
+public interface ZipContainerDetector {
- public ZipContainerDetector(List<ZipDetector> zipDetectors) {
- //OPCBased needs to be last!!!
- this.zipDetectors = zipDetectors;
- }
+ /**
+ * If detection is successful, the detector should set the zip
+ * file or OPCPackage via TikaInputStream.setOpenContainer().
+ * @param zipFile
+ * @param tis
+ * @return
+ * @throws IOException
+ */
+ MediaType detect(ZipFile zipFile, TikaInputStream tis) throws IOException;
/**
- * If this is less than 0, the file will be spooled to disk,
- * and detection will run on the full file.
- * If this is greater than 0, the {@link StreamingZipContainerDetector}
- * will be called only up to the markLimit.
+ * Try to detect on a specific entry. Detectors are allowed to store
+ * state (e.g. "remember what they've seen") in the {@link StreamingDetectContext}
*
- * @param markLimit mark limit for streaming detection
+ * @param zae
+ * @return
*/
- public void setMarkLimit(int markLimit) {
- this.markLimit = markLimit;
- }
-
- @Override
- public MediaType detect(InputStream input, Metadata metadata) throws IOException {
- // Check if we have access to the document
- if (input == null) {
- return MediaType.OCTET_STREAM;
- }
-
- byte[] prefix = new byte[1024]; // enough for all known archive formats
- input.mark(1024);
- int length = -1;
- try {
- length = IOUtils.read(input, prefix, 0, 1024);
- } finally {
- input.reset();
- }
-
- MediaType type = detectArchiveFormat(prefix, length);
-
- if (type == TIFF) {
- return TIFF;
- } else if (isZipArchive(type)) {
-
- if (TikaInputStream.isTikaInputStream(input)) {
- TikaInputStream tis = TikaInputStream.cast(input);
- if (markLimit < 0) {
- tis.getFile();
- }
- if (tis.hasFile()) {
- return detectZipFormatOnFile(tis);
- }
- }
-
- try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
- //TODO: figure out this one
- //return streamingZipContainerDetector.detect(lookahead, metadata);
- }
- } else if (!type.equals(MediaType.OCTET_STREAM)) {
- return type;
- } else {
- return detectCompressorFormat(prefix, length);
- }
- return PackageConstants.ZIP;
- }
+ MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis, StreamingDetectContext detectContext);
/**
- * This will call TikaInputStream's getFile(). If there are no exceptions,
- * it will place the ZipFile in TikaInputStream's openContainer and leave it
- * open.
- * @param tis
+ * After we've finished streaming the zip archive entries,
+ * a detector may make a final decision.
+ *
* @return
*/
- private MediaType detectZipFormatOnFile(TikaInputStream tis) {
- try {
- ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
-
- try{
- for (ZipDetector zipDetector : zipDetectors) {
- MediaType type = zipDetector.detect(zip, tis);
- if (type != null) {
- return type;
- }
- }
- } finally {
- tis.setOpenContainer(zip);
- }
-
- } catch (IOException e) {
- // ignore
- }
- // Fallback: it's still a zip file, we just don't know what kind of one
- return MediaType.APPLICATION_ZIP;
- }
-
-
- static boolean isZipArchive(MediaType type) {
- return type.equals(PackageConstants.ZIP)
- || type.equals(PackageConstants.JAR);
- }
-
- private static boolean isTiff(byte[] prefix) {
- for (byte[] sig : TIFF_SIGNATURES) {
- if(arrayStartWith(sig, prefix)) {
- return true;
- }
- }
- return false;
- }
-
- private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
- if (haystack.length < needle.length) {
- return false;
- }
- for (int i = 0; i < needle.length; i++) {
- if (haystack[i] != needle[i]) {
- return false;
- }
- }
- return true;
- }
-
- private static MediaType detectArchiveFormat(byte[] prefix, int length) {
- if (isTiff(prefix)) {
- return TIFF;
- }
- try {
- String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
- return PackageConstants.getMediaType(name);
- } catch (ArchiveException e) {
- return MediaType.OCTET_STREAM;
- }
- }
+ MediaType streamingDetectFinal(StreamingDetectContext detectContext);
- private static MediaType detectCompressorFormat(byte[] prefix, int length) {
- try {
- String type = CompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
- return CompressorConstants.getMediaType(type);
- } catch (CompressorException e) {
- return MediaType.OCTET_STREAM;
- }
- }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetectorBase.java b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetectorBase.java
index b9e324e..db38843 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetectorBase.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/ZipContainerDetectorBase.java
@@ -74,8 +74,6 @@ abstract class ZipContainerDetectorBase {
static final MediaType XPS =
MediaType.application("vnd.ms-xpsdocument");
- static final MediaType BAU =
- MediaType.application("vnd.openofficeorg.autotext");
static final Set<String> OOXML_HINTS = fillSet(
"word/document.xml",
@@ -88,20 +86,9 @@ abstract class ZipContainerDetectorBase {
"xl/worksheets/sheet1.xml"
);
- static final Map<String, MediaType> STAR_OFFICE_X = new HashMap<>();
-
- static {
- STAR_OFFICE_X.put("application/vnd.sun.xml.writer",
- MediaType.application("vnd.sun.xml.writer"));
- STAR_OFFICE_X.put("application/vnd.sun.xml.calc",
- MediaType.application("vnd.sun.xml.calc"));
- STAR_OFFICE_X.put("application/vnd.sun.xml.draw",
- MediaType.application("vnd.sun.xml.draw"));
- STAR_OFFICE_X.put("application/vnd.sun.xml.impress",
- MediaType.application("vnd.sun.xml.impress"));
- STAR_OFFICE_X.put("application/vnd.sun.star.configuration-data",
- MediaType.application("vnd.openofficeorg.extension"));
- }
+
+
+
private static Set<String> fillSet(String ... args) {
Set<String> tmp = new HashSet<>();
for (String arg : args) {
@@ -154,7 +141,6 @@ abstract class ZipContainerDetectorBase {
}
}
if (kmlFound) {
- return MediaType.application("vnd.google-earth.kmz");
}
return null;
}
@@ -193,55 +179,6 @@ abstract class ZipContainerDetectorBase {
return null;
}
- //parse the META-INF/content.xml file
- static MediaType detectStarOfficeX(InputStream is) {
- StarOfficeXHandler handler = new StarOfficeXHandler();
- try {
- XMLReaderUtils.parseSAX(is,
- new OfflineContentHandler(handler),
- new ParseContext());
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- }
- return handler.mediaType;
- }
-
- private static class StarOfficeXHandler extends DefaultHandler {
-
- private MediaType mediaType = null;
-
- @Override
- public void startElement(String uri, String localName,
- String name, Attributes attrs) throws SAXException {
- if (! "file-entry".equals(localName)) {
- return;
- }
- String mediaTypeString = null;
- String fullPath = null;
- for (int i = 0; i < attrs.getLength(); i++) {
- String attrName = attrs.getLocalName(i);
- if (attrName.equals("media-type")) {
- mediaTypeString = attrs.getValue(i);
- if (STAR_OFFICE_X.containsKey(mediaTypeString)) {
- mediaType = STAR_OFFICE_X.get(mediaTypeString);
- throw new StoppingEarlyException();
- }
- } else if (attrName.equals("full-path")) {
- fullPath = attrs.getValue(i);
- }
- }
- if ("".equals(mediaTypeString) && "/".equals(fullPath)) {
- mediaType = BAU;
- throw new StoppingEarlyException();
- }
- }
- }
- /**
- * sentinel exception to stop parsing xml once target is found
- */
- static class StoppingEarlyException extends SAXException {
- }
}
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 0000000..bada2c9
--- /dev/null
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.detect.zip.DefaultZipContainerDetector
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipContainerDetector b/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipContainerDetector
new file mode 100644
index 0000000..2f26d0c
--- /dev/null
+++ b/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipContainerDetector
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.detect.zip.IPADetector
+org.apache.tika.detect.zip.JarDetector
+org.apache.tika.detect.zip.KMZDetector
+org.apache.tika.detect.zip.OpenDocumentDetector
+org.apache.tika.detect.zip.StarOfficeDetector
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipDetector b/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipDetector
deleted file mode 100644
index 405749b..0000000
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/resources/META-INF/services/org.apache.tika.detect.zip.ZipDetector
+++ /dev/null
@@ -1,5 +0,0 @@
-org.apache.tika.detect.zip.IPADetector
-org.apache.tika.detect.zip.JarDetector
-org.apache.tika.detect.zip.KMZDetector
-org.apache.tika.detect.zip.OpenDocumentDetector
-org.apache.tika.detect.zip.StarOfficeDetector
\ No newline at end of file
diff --git a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java b/tika-parser-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
similarity index 51%
copy from tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java
copy to tika-parser-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
index bc484fe..c00ff45 100644
--- a/tika-parser-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StarOfficeDetector.java
+++ b/tika-parser-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
@@ -16,25 +16,30 @@
*/
package org.apache.tika.detect.zip;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipFile;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
-import java.io.IOException;
-import java.io.InputStream;
+import java.util.List;
-public class StarOfficeDetector implements ZipDetector {
- @Override
- public MediaType detect(ZipFile zip, TikaInputStream tis) throws IOException {
- ZipArchiveEntry zae = zip.getEntry("META-INF/manifest.xml");
+import static org.junit.Assert.assertEquals;
- if (zae == null) {
- return null;
- }
+/**
+ * Test case for detecting zip-based container formats.
+ */
+public class ZipParserTest extends TikaTest {
+
+
+ @Test
+ public void testKMZDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
+ assertEquals("application/vnd.google-earth.kmz", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+ }
- try (InputStream is = zip.getInputStream(zae)) {
- return ZipContainerDetectorBase.detectStarOfficeX(is);
- }
+ @Test
+ public void testJARDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
+ assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
}
}
diff --git a/tika-parsers/src/test/resources/test-documents/testJAR.jar b/tika-parser-modules/tika-parser-zip-commons/src/test/resources/test-documents/testJAR.jar
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testJAR.jar
rename to tika-parser-modules/tika-parser-zip-commons/src/test/resources/test-documents/testJAR.jar
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testKMZ.kmz b/tika-parser-modules/tika-parser-zip-commons/src/test/resources/test-documents/testKMZ.kmz
similarity index 100%
rename from tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testKMZ.kmz
rename to tika-parser-modules/tika-parser-zip-commons/src/test/resources/test-documents/testKMZ.kmz
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index ce3d638..2dfac4f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -31,10 +31,8 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
@@ -272,7 +270,7 @@ public class ForkParserIntegrationTest extends MultiThreadedTikaTest {
try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser())) {
ContentHandler output = new BodyContentHandler();
InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
- "/test-documents/moby.zip");
+ "/test-documents/moby.zip");
ParseContext context = new ParseContext();
parser.parse(stream, output, new Metadata(), context);
assertContains("Moby Dick", output.toString());