You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/13 18:02:22 UTC

[tika] branch 2.x updated: TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  143efc8   TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml
143efc8 is described below

commit 143efc8d92735099f5077956d8f257aad106321a
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 13 14:02:12 2017 -0400

    TIKA-2311 -- maintain x-tika-ooxml mime type for truncated ooxml
---
 .../org/apache/tika/parser/pkg/PackageTest.java    |   2 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  18 +++++++
 .../org/apache/tika/parser/pkg/PackageParser.java  |  60 +++++++++++++++++++--
 .../org/apache/tika/parser/pkg/TarParserTest.java  |   2 +-
 .../test-documents/testWORD_truncated.docx         | Bin 0 -> 763 bytes
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
index c47a348..fc11828 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
@@ -262,7 +262,7 @@ public class PackageTest extends TikaTest {
             parser.parse(stream, handler, metadata, recursingContext);
         }
 
-        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertContains("test-documents/testEXCEL.xls", content);
         assertContains("Sample Excel Worksheet", content);
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index f555617..847c0b0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -27,6 +27,7 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
 import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
@@ -43,6 +44,7 @@ import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -62,6 +64,7 @@ import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class OOXMLParserTest extends TikaTest {
 
@@ -1411,6 +1414,21 @@ public class OOXMLParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testTruncated() throws Exception {
+        //a truncated docx must fail to parse, yet keep the ooxml content type
+        Metadata truncatedMetadata = new Metadata();
+        Parser autoDetect = new AutoDetectParser();
+        try (InputStream truncated = getTestDocument("testWORD_truncated.docx")) {
+            autoDetect.parse(truncated, new DefaultHandler(), truncatedMetadata, new ParseContext());
+            fail("should have thrown an EOF exception?!");
+        } catch (TikaException expected) {
+            //expected failure: a TikaException whose cause is the EOFException
+            assertTrue(expected.getCause() instanceof EOFException);
+            String contentType = truncatedMetadata.get(Metadata.CONTENT_TYPE);
+            assertEquals("application/x-tika-ooxml", contentType);
+        }
+    }
 }
 
 
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index e636537..734de63 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -41,6 +41,7 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException.
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -50,6 +51,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
@@ -85,6 +87,10 @@ public class PackageParser extends AbstractParser {
     private static final Set<MediaType> SUPPORTED_TYPES =
             MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
 
+    //this can't be static because of the ForkParser; it is lazily loaded in parse() when null.
+    //volatile: parse() null-checks this field outside the lock before re-checking under it
+    private volatile MediaTypeRegistry bufferedMediaTypeRegistry;
+    private final Object lock = new Object[0];
     static MediaType getMediaType(ArchiveInputStream stream) {
         if (stream instanceof JarArchiveInputStream) {
             return JAR;
@@ -117,7 +123,26 @@ public class PackageParser extends AbstractParser {
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-       
+
+        //lazily load the MediaTypeRegistry at parse time
+        //only want to call getDefaultConfig() once, and can't
+        //load statically because of the ForkParser
+        TikaConfig config = context.get(TikaConfig.class);
+        MediaTypeRegistry mediaTypeRegistry = null;
+        if (config != null) {
+            mediaTypeRegistry = config.getMediaTypeRegistry();
+        } else {
+            if (bufferedMediaTypeRegistry == null) {
+                //buffer this for next time.
+                synchronized (lock) {
+                    //now that we're locked, check again
+                    if (bufferedMediaTypeRegistry == null) {
+                        bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
+                    }
+                }
+            }
+            mediaTypeRegistry = bufferedMediaTypeRegistry;
+        }
         // Ensure that the stream supports the mark feature
         if (! stream.markSupported()) {
             stream = new BufferedInputStream(stream);
@@ -165,10 +190,7 @@ public class PackageParser extends AbstractParser {
             throw new TikaException("Unable to unpack document stream", e);
         }
 
-        MediaType type = getMediaType(ais);
-        if (!type.equals(MediaType.OCTET_STREAM)) {
-            metadata.set(CONTENT_TYPE, type.toString());
-        }
+        updateMediaType(ais, mediaTypeRegistry, metadata);
         // Use the delegate parser to parse the contained document
         EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
 
@@ -200,6 +222,34 @@ public class PackageParser extends AbstractParser {
         xhtml.endDocument();
     }
 
+    /**
+     * Writes the container type detected from the archive stream into the
+     * metadata, unless the metadata already carries a more specific type.
+     *
+     * @param ais archive stream handed to getMediaType for detection
+     * @param mediaTypeRegistry registry used for the specialization check
+     * @param metadata metadata whose CONTENT_TYPE may be overwritten
+     */
+    private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
+        MediaType detected = getMediaType(ais);
+        if (detected.equals(MediaType.OCTET_STREAM)) {
+            //nothing useful was detected; leave the metadata untouched
+            return;
+        }
+
+        //the user or an earlier step may have passed in a content type;
+        //a missing or unparseable value is treated as "no prior type"
+        String priorValue = metadata.get(CONTENT_TYPE);
+        MediaType prior = (priorValue == null) ? null : MediaType.parse(priorValue);
+
+        //keep the prior type only if it is a specialization of the detected type
+        boolean keepPrior = prior != null
+                && mediaTypeRegistry.isSpecializationOf(prior, detected);
+        if (!keepPrior) {
+            metadata.set(CONTENT_TYPE, detected.toString());
+        }
+    }
+
     private void parseEntry(
             ArchiveInputStream archive, ArchiveEntry entry,
             EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml)
diff --git a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index 35ab265..f47c9fc 100644
--- a/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parser-modules/tika-parser-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -46,7 +46,7 @@ public class TarParserTest extends AbstractPkgTest {
             parser.parse(stream, handler, metadata, recursingContext);
         }
 
-        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertContains("test-documents/testEXCEL.xls", content);
         assertContains("test-documents/testHTML.html", content);
diff --git a/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx b/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx
new file mode 100644
index 0000000..ebe5e1a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testWORD_truncated.docx differ

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].