You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/05 14:19:17 UTC

svn commit: r1165259 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java

Author: jukka
Date: Mon Sep  5 12:19:17 2011
New Revision: 1165259

URL: http://svn.apache.org/viewvc?rev=1165259&view=rev
Log:
TIKA-704: PDF and Outlook docs embedded in MS Word documents not parsed

Restore support for the embedded PDF in the TestWithPdf.docx document without breaking the new EmbeddedDocument.docx test case

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1165259&r1=1165258&r2=1165259&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Mon Sep  5 12:19:17 2011
@@ -38,7 +38,7 @@ import org.apache.tika.extractor.Parsing
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
@@ -152,9 +152,10 @@ public abstract class AbstractOOXMLExtra
             Metadata metadata = new Metadata();
             TikaInputStream stream = null;
 
-            OfficeParser.POIFSDocumentType dt = OfficeParser.POIFSDocumentType.detectType(fs);
-
-            if (dt.equals(OfficeParser.POIFSDocumentType.OLE10_NATIVE)) {
+            DirectoryNode root = fs.getRoot();
+            if (POIFSDocumentType.OLE10_NATIVE.equals(
+                    POIFSDocumentType.detectType(root))) {
+                // TIKA-704: OLE 1.0 embedded document
                 Ole10Native ole =
                         Ole10Native.createFromEmbeddedOleObject(fs);
                 metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
@@ -169,6 +170,18 @@ public abstract class AbstractOOXMLExtra
                             stream, new EmbeddedContentHandler(handler),
                             metadata, false);
                 }
+            } else if (root.hasEntry("CONTENTS")
+                    && root.hasEntry("\u0001Ole")
+                    && root.hasEntry("\u0001CompObj")
+                    && root.hasEntry("\u0003ObjInfo")) {
+                // TIKA-704: OLE 2.0 embedded non-Office document?
+                stream = TikaInputStream.get(
+                        fs.createDocumentInputStream("CONTENTS"));
+                if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                    embeddedExtractor.parseEmbedded(
+                            stream, new EmbeddedContentHandler(handler),
+                            metadata, false);
+                }
             } else {
                 handleEmbeddedFile(part, handler);
             }