You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/05 14:19:17 UTC
svn commit: r1165259 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Author: jukka
Date: Mon Sep 5 12:19:17 2011
New Revision: 1165259
URL: http://svn.apache.org/viewvc?rev=1165259&view=rev
Log:
TIKA-704: PDF and Outlook docs embedded in MS Word documents not parsed
Restore support for the embedded PDF in the TestWithPdf.docx document without breaking the new EmbeddedDocument.docx test case.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1165259&r1=1165258&r2=1165259&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Mon Sep 5 12:19:17 2011
@@ -38,7 +38,7 @@ import org.apache.tika.extractor.Parsing
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
@@ -152,9 +152,10 @@ public abstract class AbstractOOXMLExtra
Metadata metadata = new Metadata();
TikaInputStream stream = null;
- OfficeParser.POIFSDocumentType dt = OfficeParser.POIFSDocumentType.detectType(fs);
-
- if (dt.equals(OfficeParser.POIFSDocumentType.OLE10_NATIVE)) {
+ DirectoryNode root = fs.getRoot();
+ if (POIFSDocumentType.OLE10_NATIVE.equals(
+ POIFSDocumentType.detectType(root))) {
+ // TIKA-704: OLE 1.0 embedded document
Ole10Native ole =
Ole10Native.createFromEmbeddedOleObject(fs);
metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
@@ -169,6 +170,18 @@ public abstract class AbstractOOXMLExtra
stream, new EmbeddedContentHandler(handler),
metadata, false);
}
+ } else if (root.hasEntry("CONTENTS")
+ && root.hasEntry("\u0001Ole")
+ && root.hasEntry("\u0001CompObj")
+ && root.hasEntry("\u0003ObjInfo")) {
+ // TIKA-704: OLE 2.0 embedded non-Office document?
+ stream = TikaInputStream.get(
+ fs.createDocumentInputStream("CONTENTS"));
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ stream, new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
} else {
handleEmbeddedFile(part, handler);
}