You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:53 UTC
[5/5] tika git commit: TIKA-2026 -- improve extraction of attachments
for PPT, PPTX, XLSX
TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dd3c2a48
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dd3c2a48
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dd3c2a48
Branch: refs/heads/2.x
Commit: dd3c2a486a41903d5ebeb4bf341be29e02af8499
Parents: 933af20
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:54:40 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:54:40 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 5 +++-
.../microsoft/AbstractPOIFSExtractor.java | 19 ++++++++++----
.../tika/parser/microsoft/HSLFExtractor.java | 18 ++++++++++---
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 +--
.../tika/parser/microsoft/ExcelParserTest.java | 13 +++++++---
.../parser/microsoft/PowerPointParserTest.java | 14 ++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 ++++++++++++++++---
.../test-documents/testEXCEL_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testEXCEL_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testPPT_embeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_embeddedPDF.pptx | Bin 0 -> 108637 bytes
11 files changed, 78 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 766780f..64e1f53 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,7 +17,10 @@ Release 2.0 - ???
Release 1.14 - ???
- * Add parser for applefile (AppleSingle) (TIKA-2022)
+ * Improve extraction of embedded documents for PPT, PPTX and XLSX
+ (TIKA-2026).
+
+ * Add parser for applefile (AppleSingle) (TIKA-2022).
* Add mime types, mime magic and/or globs for:
* Endnote Import File (TIKA-2011)
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 1225288..739af69 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -152,6 +152,15 @@ abstract class AbstractPOIFSExtractor {
protected void handleEmbeddedOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ handleEmbeddedOfficeDoc(dir, null, xhtml);
+ }
+
+ /**
+ * Handle an office document that's embedded at the POIFS level; resourceName, when non-null, replaces dir.getName() as the base of the reported resource name
+ */
+ protected void handleEmbeddedOfficeDoc(
+ DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
// Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -177,21 +186,21 @@ abstract class AbstractPOIFSExtractor {
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
-
+ String rName = (resourceName == null) ? dir.getName() : resourceName;
try {
if (type == POIFSDocumentType.OLE10_NATIVE) {
try {
// Try to un-wrap the OLE10Native record:
Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
if (ole.getLabel() != null) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
}
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
// Not a valid OLE10Native record, skip it
} catch (Exception e) {
- logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+ logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + rName, e);
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
@@ -219,13 +228,13 @@ abstract class AbstractPOIFSExtractor {
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
} catch (Exception e) {
throw new TikaException("Invalid embedded resource", e);
}
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
}
// Should we parse it?
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 656fdbb..1b34f03 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.List;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
@@ -40,6 +41,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -369,10 +372,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
String mediaType = null;
if ("Excel.Chart.8".equals(oleShape.getProgID())) {
mediaType = "application/vnd.ms-excel";
+ } else {
+ MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+ mediaType = mt.toString();
+ }
+ if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+ try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
+ handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+ }
+ } else {
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
}
- handleEmbeddedResource(
- stream, objID, objID,
- mediaType, xhtml, false);
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 84e9752..cd1919d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -229,8 +229,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (root.hasEntry("CONTENTS")
&& root.hasEntry("\u0001Ole")
- && root.hasEntry("\u0001CompObj")
- && root.hasEntry("\u0003ObjInfo")) {
+ && root.hasEntry("\u0001CompObj")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
//TODO: original file paths can be stored underneath root
//figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 3e98aa9..196ffa9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import java.io.InputStream;
-import java.util.Locale;
-
import org.apache.tika.TikaTest;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -402,4 +403,10 @@ public class ExcelParserTest extends TikaTest {
//link on textbox
// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e0eee56..32d462e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.junit.Assert.assertEquals;
-
import java.io.InputStream;
+import java.util.List;
import java.util.Locale;
+import static org.junit.Assert.assertEquals;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -238,4 +239,13 @@ public class PowerPointParserTest extends TikaTest {
XMLResult r = getXML("testPPT_comment.ppt");
assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+ assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b442d07..5159ade 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,10 +16,6 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
@@ -29,9 +25,14 @@ import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
@@ -1209,6 +1210,23 @@ public class OOXMLParserTest extends TikaTest {
//link on textbox
assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDFInPPTX() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.pptx");
+ Metadata pdfMetadata1 = metadataList.get(2);
+ assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
+ Metadata pdfMetadata2 = metadataList.get(4);
+ assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testEmbeddedPDFInXLSX() throws Exception { // verifies the PDF embedded in the OOXML (.xlsx) workbook is extracted
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xlsx"); // was the .xls fixture, which ExcelParserTest already covers
+ Metadata pdfMetadata = metadataList.get(2); // NOTE(review): index carried over from the .xls variant -- confirm it matches the .xlsx embedding order
+ assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx differ