You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/05 19:04:05 UTC

svn commit: r1165363 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft: POIFSContainerDetector.java ooxml/AbstractOOXMLExtractor.java

Author: nick
Date: Mon Sep  5 17:04:05 2011
New Revision: 1165363

URL: http://svn.apache.org/viewvc?rev=1165363&view=rev
Log:
TIKA-704 Tweak detection of embedded non-office documents in OLE2 streams

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1165363&r1=1165362&r2=1165363&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Sep  5 17:04:05 2011
@@ -106,8 +106,13 @@ public class POIFSContainerDetector impl
                 return PPT;
             } else if (names.contains("VisioDocument")) {
                 return VSD;
+            } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
+               return WPS;
             } else if (names.contains("CONTENTS")) {
-                return WPS;
+               // CONTENTS without SPELLING normally means some sort of
+               //  embedded non-office file inside an OLE2 document.
+               // This case is most commonly triggered on nested directories.
+               return OLE;
             } else if (names.contains("\u0001Ole10Native")) {
                 return OLE;
             } else if (names.contains("PerfectOffice_MAIN")) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1165363&r1=1165362&r2=1165363&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Mon Sep  5 17:04:05 2011
@@ -153,8 +153,21 @@ public abstract class AbstractOOXMLExtra
             TikaInputStream stream = null;
 
             DirectoryNode root = fs.getRoot();
-            if (POIFSDocumentType.OLE10_NATIVE.equals(
-                    POIFSDocumentType.detectType(root))) {
+            POIFSDocumentType type = POIFSDocumentType.detectType(root);
+            
+            if (root.hasEntry("CONTENTS")
+                  && root.hasEntry("\u0001Ole")
+                  && root.hasEntry("\u0001CompObj")
+                  && root.hasEntry("\u0003ObjInfo")) {
+               // TIKA-704: OLE 2.0 embedded non-Office document?
+               stream = TikaInputStream.get(
+                     fs.createDocumentInputStream("CONTENTS"));
+               if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                  embeddedExtractor.parseEmbedded(
+                        stream, new EmbeddedContentHandler(handler),
+                        metadata, false);
+               }
+            } else if (POIFSDocumentType.OLE10_NATIVE == type) {
                 // TIKA-704: OLE 1.0 embedded document
                 Ole10Native ole =
                         Ole10Native.createFromEmbeddedOleObject(fs);
@@ -170,18 +183,6 @@ public abstract class AbstractOOXMLExtra
                             stream, new EmbeddedContentHandler(handler),
                             metadata, false);
                 }
-            } else if (root.hasEntry("CONTENTS")
-                    && root.hasEntry("\u0001Ole")
-                    && root.hasEntry("\u0001CompObj")
-                    && root.hasEntry("\u0003ObjInfo")) {
-                // TIKA-704: OLE 2.0 embedded non-Office document?
-                stream = TikaInputStream.get(
-                        fs.createDocumentInputStream("CONTENTS"));
-                if (embeddedExtractor.shouldParseEmbedded(metadata)) {
-                    embeddedExtractor.parseEmbedded(
-                            stream, new EmbeddedContentHandler(handler),
-                            metadata, false);
-                }
             } else {
                 handleEmbeddedFile(part, handler);
             }