You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/28 16:54:48 UTC
tika git commit: TIKA-2026 -- improve extraction of embedded files
from ppt, pptx and xlsx
Repository: tika
Updated Branches:
refs/heads/master 69d825005 -> 7cc610e1b
TIKA-2026 -- improve extraction of embedded files from ppt, pptx and xlsx
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7cc610e1
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7cc610e1
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7cc610e1
Branch: refs/heads/master
Commit: 7cc610e1b3f164fe9de00b1a35e60fd00a69bb46
Parents: 69d8250
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 12:54:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 12:54:33 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 5 ++++-
.../microsoft/AbstractPOIFSExtractor.java | 19 ++++++++++++++-----
.../tika/parser/microsoft/HSLFExtractor.java | 18 +++++++++++++++---
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 +--
.../tika/parser/microsoft/ExcelParserTest.java | 10 ++++++++--
.../parser/microsoft/PowerPointParserTest.java | 11 +++++++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 19 +++++++++++++++++++
.../test-documents/testExcel_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testExcel_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testPPT_EmbeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_EmbeddedPDF.pptx | Bin 0 -> 108637 bytes
11 files changed, 72 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 7e00048..34baef1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,9 @@
Release 1.14 - ???
- * Add parser for applefile (AppleSingle) (TIKA-2022)
+ * Improve extraction of embedded documents from PPT, PPTX and XLSX
+ (TIKA-2026).
+
+ * Add parser for applefile (AppleSingle) (TIKA-2022).
* Add mime types, mime magic and/or globs for:
* Endnote Import File (TIKA-2011)
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index cf9d250..a240526 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -150,6 +150,15 @@ abstract class AbstractPOIFSExtractor {
protected void handleEmbeddedOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ handleEmbeddedOfficeDoc(dir, null, xhtml);
+ }
+
+ /**
+ * Handle an office document that's embedded at the POIFS level
+ */
+ protected void handleEmbeddedOfficeDoc(
+ DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
// Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -176,21 +185,21 @@ abstract class AbstractPOIFSExtractor {
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
-
+ String rName = (resourceName == null) ? dir.getName() : resourceName;
try {
if (type == POIFSDocumentType.OLE10_NATIVE) {
try {
// Try to un-wrap the OLE10Native record:
Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
if (ole.getLabel() != null) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
}
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
// Not a valid OLE10Native record, skip it
} catch (Exception e) {
- logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+ logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + rName, e);
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
@@ -218,13 +227,13 @@ abstract class AbstractPOIFSExtractor {
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
} catch (Exception e) {
throw new TikaException("Invalid embedded resource", e);
}
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
}
// Should we parse it?
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index e17dfe1..63c9e3f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -39,7 +39,10 @@ import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -378,10 +381,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
String mediaType = null;
if ("Excel.Chart.8".equals(oleShape.getProgID())) {
mediaType = "application/vnd.ms-excel";
+ } else {
+ MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+ mediaType = mt.toString();
+ }
+ if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+ try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
+ handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+ }
+ } else {
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
}
- handleEmbeddedResource(
- stream, objID, objID,
- mediaType, xhtml, false);
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 84e9752..cd1919d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -229,8 +229,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (root.hasEntry("CONTENTS")
&& root.hasEntry("\u0001Ole")
- && root.hasEntry("\u0001CompObj")
- && root.hasEntry("\u0003ObjInfo")) {
+ && root.hasEntry("\u0001CompObj")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
//TODO: original file paths can be stored underneath root
//figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 555ed3e..25aead6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,13 +16,12 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.TikaTest.assertContains;
-import static org.apache.tika.TikaTest.assertNotContained;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.InputStream;
+import java.util.List;
import java.util.Locale;
import org.apache.tika.TikaTest;
@@ -37,6 +36,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
@@ -456,4 +456,10 @@ public class ExcelParserTest extends TikaTest {
//link on textbox
// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ assertContains("Hello World!", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 55306b5..9ab5099 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import java.util.List;
import java.util.Locale;
import org.apache.tika.TikaTest;
@@ -27,6 +28,7 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -238,4 +240,13 @@ public class PowerPointParserTest extends TikaTest {
XMLResult r = getXML("testPPT_comment.ppt");
assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+ assertContains("Apache Tika project", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("Hello World", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b35a0f9..ea947d9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -45,6 +46,7 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
@@ -1209,6 +1211,23 @@ public class OOXMLParserTest extends TikaTest {
//link on textbox
assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDFInPPTX() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.pptx");
+ Metadata pdfMetadata1 = metadataList.get(2);
+ assertContains("Apache Tika", pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
+ Metadata pdfMetadata2 = metadataList.get(4);
+ assertContains("Hello World", pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testEmbeddedPDFInXLSX() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ Metadata pdfMetadata = metadataList.get(2);
+ assertContains("Hello World", pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/7cc610e1/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx differ