You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/26 21:07:57 UTC
[tika] branch branch_1x updated: TIKA-2588 -- extract xlsx stored within ole objects in ppt/x via Brian McColgan
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 71cf654 TIKA-2588 -- extract xlsx stored within ole objects in ppt/x via Brian McColgan
new 99f4852 Merge branch 'branch_1x' of https://github.com/apache/tika into branch_1x
71cf654 is described below
commit 71cf6548c5ca88bf2e1debdb24f06bddd4f03690
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Feb 26 15:20:30 2018 -0500
TIKA-2588 -- extract xlsx stored within ole objects in ppt/x via Brian McColgan
---
CHANGES.txt | 3 +++
.../tika/parser/microsoft/HSLFExtractor.java | 3 ++-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 23 ++++++++++++++++-----
.../parser/microsoft/PowerPointParserTest.java | 13 ++++++++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 11 ++++++++++
.../test-documents/testPPT_oleWorkbook.ppt | Bin 0 -> 98304 bytes
.../test-documents/testPPT_oleWorkbook.pptx | Bin 0 -> 44001 bytes
7 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 0185d44..eb5fa7d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.18 - ???
+ * Extract xlsx files embedded in OLE objects within PPT and PPTX
+ via Brian McColgan (TIKA-2588).
+
* Extract files embedded in HTML and javascript inside HTML
that are stored in the Data URI scheme (TIKA-2563).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index d0a1abe..02665e5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -487,7 +487,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
mediaType = mt.toString();
}
- if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+ if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
+ || mediaType.equals("application/x-tika-msoffice")) {
try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 2560db0..288e99c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -25,6 +25,7 @@ import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -40,6 +41,8 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
@@ -299,15 +302,25 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
DirectoryNode root = fs.getRoot();
POIFSDocumentType type = POIFSDocumentType.detectType(root);
- if (root.hasEntry("CONTENTS")
- && root.hasEntry("\u0001Ole")
- && root.hasEntry("\u0001CompObj")) {
+ if (root.hasEntry("\u0001Ole")
+ && root.hasEntry("\u0001CompObj")
+ && (
+ root.hasEntry("CONTENTS") || root.hasEntry("Package")
+ )) {
// TIKA-704: OLE 2.0 embedded non-Office document?
//TODO: figure out if the equivalent of OLE 1.0's
//getCommand() and getFileName() exist for OLE 2.0 to populate
//TikaCoreProperties.ORIGINAL_RESOURCE_NAME
- stream = TikaInputStream.get(
- fs.createDocumentInputStream("CONTENTS"));
+ if (root.hasEntry("CONTENTS")) {
+ stream = TikaInputStream.get(
+ fs.createDocumentInputStream("CONTENTS"));
+ } else if (root.hasEntry("Package")) {
+ //TIKA-2588
+ stream = TikaInputStream.get(
+ fs.createDocumentInputStream("Package"));
+ } else {
+ throw new IllegalStateException("Shouldn't ever arrive here; please open a ticket on our jira");
+ }
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
embeddedExtractor.parseEmbedded(
stream, new EmbeddedContentHandler(handler),
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e4e4332..f217ef0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -342,4 +342,17 @@ public class PowerPointParserTest extends TikaTest {
String content = getXML("testPPT_groups.ppt").xml;
assertContains("href=\"http://tika.apache.org", content);
}
+
+ @Test
+ public void testEmbeddedXLSInOLEObject() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.ppt");
+ debug(metadataList);
+ assertEquals(3, metadataList.size());
+ Metadata xlsx = metadataList.get(1);
+ assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ xlsx.get(Metadata.CONTENT_TYPE));
+
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 3cd5c65..f0c7075 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1816,6 +1816,17 @@ public class OOXMLParserTest extends TikaTest {
assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
}
+
+ @Test
+ public void testEmbeddedXLSInOLEObject() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_oleWorkbook.pptx");
+ assertEquals(4, metadataList.size());
+ Metadata xlsx = metadataList.get(2);
+ assertContains("<h1>Sheet1</h1>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("<td>1</td>", xlsx.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ xlsx.get(Metadata.CONTENT_TYPE));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt
new file mode 100644
index 0000000..b0c9cc8
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.ppt differ
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx
new file mode 100644
index 0000000..5d7096a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_oleWorkbook.pptx differ
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.