You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/12/08 16:37:48 UTC
[tika] branch master updated: TIKA-2483 -- add in all children of
zip and tar to prevent overwriting of child file types by the
PackageParser. Ensure that our semi-manual list is updated when there are
changes to TikaConfig.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 90d6245 TIKA-2483 -- add in all children of zip and tar to prevent overwriting of child file types by the PackageParser. Ensure that our semi-manual list is updated when there are changes to TikaConfig.
90d6245 is described below
commit 90d624588483f379180c812b7235bb2222be9c2e
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Dec 8 11:37:38 2017 -0500
TIKA-2483 -- add in all children of zip and tar to prevent overwriting of
child file types by the PackageParser. Ensure that our semi-manual list
is updated when there are changes to TikaConfig.
---
.../org/apache/tika/parser/pkg/PackageParser.java | 92 +++++++++++++++++++---
.../apache/tika/parser/pkg/PackageParserTest.java | 27 +++++++
.../org/apache/tika/parser/pkg/ZipParserTest.java | 14 ++++
3 files changed, 124 insertions(+), 9 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 32e06c0..6ec201e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -16,12 +16,13 @@
*/
package org.apache.tika.parser.pkg;
-import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collections;
import java.util.Date;
+import java.util.HashSet;
import java.util.Set;
import org.apache.commons.compress.PasswordRequiredException;
@@ -83,15 +84,88 @@ public class PackageParser extends AbstractParser {
private static final MediaType TAR = MediaType.application("x-tar");
private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
- private static final MediaType TIKA_OOXML = MediaType.application("tika-ooxml");
+ private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
private static final MediaType GTAR = MediaType.application("x-gtar");
+ private static final MediaType KMZ = MediaType.application("vnd.google-earth.kmz");
private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ);
- private static final Set<MediaType> DONT_OVERWRITE_CONTENT_TYPE =
- MediaType.set(TIKA_OOXML, GTAR);
+ //We used to avoid overwriting file types if the file type
+ //was a specialization of zip/tar. We determined specialization of zip
+ //via TikaConfig at parse time.
+ //However, TIKA-2483 showed that TikaConfig is not serializable
+ //and this causes an exception in the ForkParser.
+ //The following is an inelegant hack, but until we can serialize TikaConfig,
+ //or dramatically rework the ForkParser to avoid serialization
+ //of parsers, this is what we have.
+ //At least there is a test in PackageParserTest that verifies that
+ //this list stays in sync with TikaConfig's default media type registry.
+ static final Set<MediaType> PACKAGE_SPECIALIZATIONS =
+ loadPackageSpecializations();
+
+ static final Set<MediaType> loadPackageSpecializations() {
+ Set<MediaType> zipSpecializations = new HashSet<>();
+ for (String mediaTypeString : new String[]{
+ //specializations of ZIP
+ "application/bizagi-modeler",
+ "application/epub+zip",
+ "application/java-archive",
+ "application/vnd.adobe.air-application-installer-package+zip",
+ "application/vnd.android.package-archive",
+ "application/vnd.apple.iwork",
+ "application/vnd.apple.keynote",
+ "application/vnd.apple.numbers",
+ "application/vnd.apple.pages",
+ "application/vnd.etsi.asic-e+zip",
+ "application/vnd.etsi.asic-s+zip",
+ "application/vnd.google-earth.kmz",
+ "application/vnd.mindjet.mindmanager",
+ "application/vnd.ms-excel.addin.macroenabled.12",
+ "application/vnd.ms-excel.sheet.binary.macroenabled.12",
+ "application/vnd.ms-excel.sheet.macroenabled.12",
+ "application/vnd.ms-excel.template.macroenabled.12",
+ "application/vnd.ms-powerpoint.addin.macroenabled.12",
+ "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+ "application/vnd.ms-powerpoint.slide.macroenabled.12",
+ "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+ "application/vnd.ms-powerpoint.template.macroenabled.12",
+ "application/vnd.ms-visio.drawing",
+ "application/vnd.ms-visio.drawing.macroenabled.12",
+ "application/vnd.ms-visio.stencil",
+ "application/vnd.ms-visio.stencil.macroenabled.12",
+ "application/vnd.ms-visio.template",
+ "application/vnd.ms-visio.template.macroenabled.12",
+ "application/vnd.ms-word.document.macroenabled.12",
+ "application/vnd.ms-word.template.macroenabled.12",
+ "application/vnd.ms-xpsdocument",
+ "application/vnd.oasis.opendocument.formula",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.openxmlformats-officedocument.presentationml.slide",
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+ "application/x-ibooks+zip",
+ "application/x-itunes-ipa",
+ "application/x-tika-iworks-protected",
+ "application/x-tika-java-enterprise-archive",
+ "application/x-tika-java-web-archive",
+ "application/x-tika-ooxml",
+ "application/x-tika-ooxml-protected",
+ "application/x-tika-visio-ooxml",
+ "application/x-xmind",
+ "model/vnd.dwfx+xps",
+
+ "application/x-gtar" //specialization of tar
+ }) {
+ zipSpecializations.add(MediaType.parse(mediaTypeString));
+ }
+ return Collections.unmodifiableSet(zipSpecializations);
+ }
@Deprecated
static MediaType getMediaType(ArchiveInputStream stream) {
@@ -234,21 +308,21 @@ public class PackageParser extends AbstractParser {
}
//now see if the user or an earlier step has passed in a content type
- String incomingContentTypeString = metadata.get(CONTENT_TYPE);
+ String incomingContentTypeString = metadata.get(Metadata.CONTENT_TYPE);
if (incomingContentTypeString == null) {
- metadata.set(CONTENT_TYPE, type.toString());
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
return;
}
MediaType incomingMediaType = MediaType.parse(incomingContentTypeString);
if (incomingMediaType == null) {
- metadata.set(CONTENT_TYPE, type.toString());
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
return;
}
- if (! DONT_OVERWRITE_CONTENT_TYPE.contains(incomingMediaType)) {
- metadata.set(CONTENT_TYPE, type.toString());
+ if (! PACKAGE_SPECIALIZATIONS.contains(incomingMediaType)) {
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index 12b7bb8..743daee 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -18,12 +18,18 @@
package org.apache.tika.parser.pkg;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.ParseContext;
import org.junit.Test;
@@ -50,4 +56,25 @@ public class PackageParserTest {
}
}
}
+
+ @Test
+ public void testSpecializations() throws Exception {
+ //Test that our manually constructed list of children of zip and tar
+ //in PackageParser is current with TikaConfig's defaultConfig.
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ MediaTypeRegistry mediaTypeRegistry = config.getMimeRepository().getMediaTypeRegistry();
+ Set<MediaType> currentSpecializations = new HashSet<>();
+ MediaType tar = MediaType.parse("application/x-tar");
+ for (MediaType type : mediaTypeRegistry.getTypes()) {
+ if (mediaTypeRegistry.isSpecializationOf(type, MediaType.APPLICATION_ZIP)
+ || mediaTypeRegistry.isSpecializationOf(type, tar)) {
+ currentSpecializations.add(type);
+// System.out.println("\""+type.toString()+"\",");
+ }
+ }
+ for (MediaType mediaType : currentSpecializations) {
+ assertTrue("missing: "+mediaType, PackageParser.PACKAGE_SPECIALIZATIONS.contains(mediaType));
+ }
+ assertEquals(currentSpecializations.size(), PackageParser.PACKAGE_SPECIALIZATIONS.size());
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index f9b7a66..be0ff9f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -31,6 +31,7 @@ import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.tika.Tika;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -213,4 +214,17 @@ public class ZipParserTest extends AbstractPkgTest {
assertContains("hello world", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
}
+
+ @Test
+ public void testKMZDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testKMZ.kmz");
+ assertEquals("application/vnd.google-earth.kmz", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testJARDetection() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
+ assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
+ }
+
}
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].