You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/26 20:24:19 UTC

[tika] branch branch_1x updated: TIKA-2580

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 520e73f  TIKA-2580
520e73f is described below

commit 520e73f905349f097571f5bb8e9de67870ca2533
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Feb 26 15:20:30 2018 -0500

    TIKA-2580
---
 CHANGES.txt                                        |   3 +++
 .../tika/parser/microsoft/HSLFExtractor.java       |   3 ++-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  23 ++++++++++++++++-----
 .../parser/microsoft/PowerPointParserTest.java     |  13 ++++++++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 ++++++++++
 .../test-documents/testPPT_oleWorkbook.ppt         | Bin 0 -> 98304 bytes
 .../test-documents/testPPT_oleWorkbook.pptx        | Bin 0 -> 44001 bytes
 7 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0185d44..eb5fa7d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.18 - ???
 
+   * Extract xlsx files embedded in OLE objects within PPT and PPTX
+     via Brian McColgan (TIKA-2588).
+
    * Extract files embedded in HTML and javascript inside HTML
      that are stored in the Data URI scheme (TIKA-2563).
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index d0a1abe..02665e5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -487,7 +487,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                             MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
                             mediaType = mt.toString();
                         }
-                        if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+                        if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
+                                || mediaType.equals("application/x-tika-msoffice")) {
                             try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
                                 handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
                             }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 2560db0..288e99c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -25,6 +25,7 @@ import java.io.InputStream;
 import java.net.URI;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -40,6 +41,8 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
@@ -299,15 +302,25 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             DirectoryNode root = fs.getRoot();
             POIFSDocumentType type = POIFSDocumentType.detectType(root);
 
-            if (root.hasEntry("CONTENTS")
-                    && root.hasEntry("\u0001Ole")
-                    && root.hasEntry("\u0001CompObj")) {
+            if (root.hasEntry("\u0001Ole")
+                    && root.hasEntry("\u0001CompObj")
+                    && (
+                            root.hasEntry("CONTENTS") || root.hasEntry("Package")
+                    )) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
                 //TODO: figure out if the equivalent of OLE 1.0's
                 //getCommand() and getFileName() exist for OLE 2.0 to populate
                 //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
-                stream = TikaInputStream.get(
-                        fs.createDocumentInputStream("CONTENTS"));
+                if (root.hasEntry("CONTENTS")) {
+                    stream = TikaInputStream.get(
+                            fs.createDocumentInputStream("CONTENTS"));
+                } else if (root.hasEntry("Package")) {
+                    //TIKA-2588: no "CONTENTS" entry; read the embedded payload from the "Package" entry instead
+                    stream = TikaInputStream.get(
+                            fs.createDocumentInputStream("Package"));
+                } else {
+                    throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira");
+                }
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                     embeddedExtractor.parseEmbedded(
                             stream, new EmbeddedContentHandler(handler),
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e4e4332..f217ef0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -342,4 +342,17 @@ public class PowerPointParserTest extends TikaTest {
         String content = getXML("testPPT_groups.ppt").xml;
         assertContains("href=\"http://tika.apache.org", content);
     }
+
+    @Test
+    public void testEmbeddedXLSInOLEObject() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
+        debug(metadataList);
+        assertEquals(3, metadataList.size());
+        Metadata xlsx = metadataList.get(1);
+        assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                xlsx.get(Metadata.CONTENT_TYPE));
+
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3cd5c65..f0c7075 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1816,6 +1816,17 @@ public class OOXMLParserTest extends TikaTest {
         assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
         assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testEmbeddedXLSInOLEObject() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
+        assertEquals(4, metadataList.size());
+        Metadata xlsx = metadataList.get(2);
+        assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                xlsx.get(Metadata.CONTENT_TYPE));
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt
new file mode 100644
index 0000000..b0c9cc8
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx
new file mode 100644
index 0000000..5d7096a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.