You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/05 19:04:05 UTC
svn commit: r1165363 - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft:
POIFSContainerDetector.java ooxml/AbstractOOXMLExtractor.java
Author: nick
Date: Mon Sep 5 17:04:05 2011
New Revision: 1165363
URL: http://svn.apache.org/viewvc?rev=1165363&view=rev
Log:
TIKA-704 Tweak detection of embedded non-office documents in OLE2 streams
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1165363&r1=1165362&r2=1165363&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Sep 5 17:04:05 2011
@@ -106,8 +106,13 @@ public class POIFSContainerDetector impl
return PPT;
} else if (names.contains("VisioDocument")) {
return VSD;
+ } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
+ return WPS;
} else if (names.contains("CONTENTS")) {
- return WPS;
+ // CONTENTS without SPELLING normally means some sort of
+ // embedded non-office file inside an OLE2 document.
+ // This is most commonly triggered on nested directories.
+ return OLE;
} else if (names.contains("\u0001Ole10Native")) {
return OLE;
} else if (names.contains("PerfectOffice_MAIN")) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1165363&r1=1165362&r2=1165363&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Mon Sep 5 17:04:05 2011
@@ -153,8 +153,21 @@ public abstract class AbstractOOXMLExtra
TikaInputStream stream = null;
DirectoryNode root = fs.getRoot();
- if (POIFSDocumentType.OLE10_NATIVE.equals(
- POIFSDocumentType.detectType(root))) {
+ POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+ if (root.hasEntry("CONTENTS")
+ && root.hasEntry("\u0001Ole")
+ && root.hasEntry("\u0001CompObj")
+ && root.hasEntry("\u0003ObjInfo")) {
+ // TIKA-704: OLE 2.0 embedded non-Office document?
+ stream = TikaInputStream.get(
+ fs.createDocumentInputStream("CONTENTS"));
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ stream, new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
+ } else if (POIFSDocumentType.OLE10_NATIVE == type) {
// TIKA-704: OLE 1.0 embedded document
Ole10Native ole =
Ole10Native.createFromEmbeddedOleObject(fs);
@@ -170,18 +183,6 @@ public abstract class AbstractOOXMLExtra
stream, new EmbeddedContentHandler(handler),
metadata, false);
}
- } else if (root.hasEntry("CONTENTS")
- && root.hasEntry("\u0001Ole")
- && root.hasEntry("\u0001CompObj")
- && root.hasEntry("\u0003ObjInfo")) {
- // TIKA-704: OLE 2.0 embedded non-Office document?
- stream = TikaInputStream.get(
- fs.createDocumentInputStream("CONTENTS"));
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
- embeddedExtractor.parseEmbedded(
- stream, new EmbeddedContentHandler(handler),
- metadata, false);
- }
} else {
handleEmbeddedFile(part, handler);
}