You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:53 UTC

[5/5] tika git commit: TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX

TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dd3c2a48
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dd3c2a48
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dd3c2a48

Branch: refs/heads/2.x
Commit: dd3c2a486a41903d5ebeb4bf341be29e02af8499
Parents: 933af20
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:54:40 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:54:40 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   5 +++-
 .../microsoft/AbstractPOIFSExtractor.java       |  19 ++++++++++----
 .../tika/parser/microsoft/HSLFExtractor.java    |  18 ++++++++++---
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   3 +--
 .../tika/parser/microsoft/ExcelParserTest.java  |  13 +++++++---
 .../parser/microsoft/PowerPointParserTest.java  |  14 ++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  26 ++++++++++++++++---
 .../test-documents/testEXCEL_embeddedPDF.xls    | Bin 0 -> 38400 bytes
 .../test-documents/testEXCEL_embeddedPDF.xlsx   | Bin 0 -> 25602 bytes
 .../test-documents/testPPT_embeddedPDF.ppt      | Bin 0 -> 187392 bytes
 .../test-documents/testPPT_embeddedPDF.pptx     | Bin 0 -> 108637 bytes
 11 files changed, 78 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 766780f..64e1f53 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,7 +17,10 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
-  * Add parser for applefile (AppleSingle) (TIKA-2022)
+  * Improve extraction of embedded documents for PPT, PPTX and XLSX
+    (TIKA-2026).
+
+  * Add parser for applefile (AppleSingle) (TIKA-2022).
 
   * Add mime types, mime magic and/or globs for:
      * Endnote Import File (TIKA-2011)

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 1225288..739af69 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -152,6 +152,15 @@ abstract class AbstractPOIFSExtractor {
     protected void handleEmbeddedOfficeDoc(
             DirectoryEntry dir, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+        handleEmbeddedOfficeDoc(dir, null, xhtml);
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
 
         // Is it an embedded OLE2 document, or an embedded OOXML document?
 
@@ -177,21 +186,21 @@ abstract class AbstractPOIFSExtractor {
         }
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         TikaInputStream embedded = null;
-
+        String rName = (resourceName == null) ? dir.getName() : resourceName;
         try {
             if (type == POIFSDocumentType.OLE10_NATIVE) {
                 try {
                     // Try to un-wrap the OLE10Native record:
                     Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                     if (ole.getLabel() != null) {
-                        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                        metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                     }
                     byte[] data = ole.getDataBuffer();
                     embedded = TikaInputStream.get(data);
                 } catch (Ole10NativeException ex) {
                     // Not a valid OLE10Native record, skip it
                 } catch (Exception e) {
-                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + rName, e);
                 }
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {
@@ -219,13 +228,13 @@ abstract class AbstractPOIFSExtractor {
 
                     // Record what we can do about it
                     metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
                 } catch (Exception e) {
                     throw new TikaException("Invalid embedded resource", e);
                 }
             } else {
                 metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
-                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
             }
 
             // Should we parse it?

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 656fdbb..1b34f03 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.common.usermodel.Hyperlink;
 import org.apache.poi.hslf.model.Comment;
 import org.apache.poi.hslf.model.HeadersFooters;
@@ -40,6 +41,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -369,10 +372,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                         String mediaType = null;
                         if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                             mediaType = "application/vnd.ms-excel";
+                        } else {
+                            MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+                            mediaType = mt.toString();
+                        }
+                        if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+                            try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
+                                handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+                            }
+                        } else {
+                            handleEmbeddedResource(
+                                    stream, objID, objID,
+                                    mediaType, xhtml, false);
                         }
-                        handleEmbeddedResource(
-                                stream, objID, objID,
-                                mediaType, xhtml, false);
                     }
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 84e9752..cd1919d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -229,8 +229,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
             if (root.hasEntry("CONTENTS")
                     && root.hasEntry("\u0001Ole")
-                    && root.hasEntry("\u0001CompObj")
-                    && root.hasEntry("\u0003ObjInfo")) {
+                    && root.hasEntry("\u0001CompObj")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
                 //TODO: original file paths can be stored underneath root
                 //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 3e98aa9..196ffa9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,13 +16,14 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.io.InputStream;
-import java.util.Locale;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
@@ -402,4 +403,10 @@ public class ExcelParserTest extends TikaTest {
         //link on textbox
 //        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
     }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e0eee56..32d462e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,11 +16,12 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import static org.junit.Assert.assertEquals;
-
 import java.io.InputStream;
+import java.util.List;
 import java.util.Locale;
 
+import static org.junit.Assert.assertEquals;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -238,4 +239,13 @@ public class PowerPointParserTest extends TikaTest {
         XMLResult r = getXML("testPPT_comment.ppt");
         assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
     }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b442d07..5159ade 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,10 +16,6 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
@@ -29,9 +25,14 @@ import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.io.TikaInputStream;
@@ -1209,6 +1210,23 @@ public class OOXMLParserTest extends TikaTest {
         //link on textbox
         assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
     }
+
+    @Test
+    public void testEmbeddedPDFInPPTX() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.pptx");
+        Metadata pdfMetadata1 = metadataList.get(2);
+        assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
+        Metadata pdfMetadata2 = metadataList.get(4);
+        assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testEmbeddedPDFInXLSX() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        Metadata pdfMetadata = metadataList.get(2);
+        assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx differ