You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/13 18:02:22 UTC
[tika] branch 2.x updated: TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new 143efc8 TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml
143efc8 is described below
commit 143efc8d92735099f5077956d8f257aad106321a
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 13 14:02:12 2017 -0400
TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml
---
.../org/apache/tika/parser/pkg/PackageTest.java | 2 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 18 +++++++
.../org/apache/tika/parser/pkg/PackageParser.java | 60 +++++++++++++++++++--
.../org/apache/tika/parser/pkg/TarParserTest.java | 2 +-
.../test-documents/testWORD_truncated.docx | Bin 0 -> 763 bytes
5 files changed, 75 insertions(+), 7 deletions(-)
diff --git a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
index c47a348..fc11828 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
@@ -262,7 +262,7 @@ public class PackageTest extends TikaTest {
parser.parse(stream, handler, metadata, recursingContext);
}
- assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("Sample Excel Worksheet", content);
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index f555617..847c0b0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -27,6 +27,7 @@ import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
@@ -43,6 +44,7 @@ import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -62,6 +64,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
public class OOXMLParserTest extends TikaTest {
@@ -1411,6 +1414,21 @@ public class OOXMLParserTest extends TikaTest {
}
+ @Test
+ public void testTruncated() throws Exception {
+ Parser p = new AutoDetectParser();
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ try (InputStream is = getTestDocument("testWORD_truncated.docx")) {
+ p.parse(is, handler, metadata, parseContext);
+ fail("should have thrown an EOF exception?!");
+ } catch (TikaException e) {
+ Throwable cause = e.getCause();
+ assertTrue(cause instanceof EOFException);
+ assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE));
+ }
+ }
}
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index e636537..734de63 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -41,6 +41,7 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -50,6 +51,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
@@ -85,6 +87,10 @@ public class PackageParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+ //this can't be static because of the ForkParser
+ //lazily load this when parse is called if it is null.
+ private MediaTypeRegistry bufferedMediaTypeRegistry;
+ private final Object lock = new Object[0];
static MediaType getMediaType(ArchiveInputStream stream) {
if (stream instanceof JarArchiveInputStream) {
return JAR;
@@ -117,7 +123,26 @@ public class PackageParser extends AbstractParser {
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
+
+ //lazily load the MediaTypeRegistry at parse time
+ //only want to call getDefaultConfig() once, and can't
+ //load statically because of the ForkParser
+ TikaConfig config = context.get(TikaConfig.class);
+ MediaTypeRegistry mediaTypeRegistry = null;
+ if (config != null) {
+ mediaTypeRegistry = config.getMediaTypeRegistry();
+ } else {
+ if (bufferedMediaTypeRegistry == null) {
+ //buffer this for next time.
+ synchronized (lock) {
+ //now that we're locked, check again
+ if (bufferedMediaTypeRegistry == null) {
+ bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
+ }
+ }
+ }
+ mediaTypeRegistry = bufferedMediaTypeRegistry;
+ }
// Ensure that the stream supports the mark feature
if (! stream.markSupported()) {
stream = new BufferedInputStream(stream);
@@ -165,10 +190,7 @@ public class PackageParser extends AbstractParser {
throw new TikaException("Unable to unpack document stream", e);
}
- MediaType type = getMediaType(ais);
- if (!type.equals(MediaType.OCTET_STREAM)) {
- metadata.set(CONTENT_TYPE, type.toString());
- }
+ updateMediaType(ais, mediaTypeRegistry, metadata);
// Use the delegate parser to parse the contained document
EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
@@ -200,6 +222,34 @@ public class PackageParser extends AbstractParser {
xhtml.endDocument();
}
+ private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
+ MediaType type = getMediaType(ais);
+ if (type.equals(MediaType.OCTET_STREAM)) {
+ return;
+ }
+
+ //now see if the user or an earlier step has passed in a content type
+ String incomingContentTypeString = metadata.get(CONTENT_TYPE);
+ if (incomingContentTypeString == null) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ return;
+ }
+
+
+ MediaType incomingMediaType = MediaType.parse(incomingContentTypeString);
+ if (incomingMediaType == null) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ return;
+ }
+ //if the existing type is a specialization of the detected type,
+ //leave in the specialization; otherwise set the detected
+ if (! mediaTypeRegistry.isSpecializationOf(incomingMediaType, type)) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ return;
+ }
+
+ }
+
private void parseEntry(
ArchiveInputStream archive, ArchiveEntry entry,
EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml)
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index 35ab265..f47c9fc 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -46,7 +46,7 @@ public class TarParserTest extends AbstractPkgTest {
parser.parse(stream, handler, metadata, recursingContext);
}
- assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("test-documents/testHTML.html", content);
diff --git a/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx b/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx
new file mode 100644
index 0000000..ebe5e1a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.