You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/19 21:06:22 UTC

tika git commit: TIKA-2218 -- add a few more places where .pptx can include embedded objects

Repository: tika
Updated Branches:
  refs/heads/master 90cdf1f6a -> ca37313a7


TIKA-2218 -- add a few more places where .pptx can include embedded objects


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ca37313a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ca37313a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ca37313a

Branch: refs/heads/master
Commit: ca37313a716d4eaa3a15a4ba770f89ee23832e99
Parents: 90cdf1f
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 19 16:06:13 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 19 16:06:13 2016 -0500

----------------------------------------------------------------------
 .../SXSLFPowerPointExtractorDecorator.java      | 52 ++++++++++++---
 .../ooxml/XSLFPowerPointExtractorDecorator.java | 60 ++++++++++++++----
 .../ooxml/OOXMLContainerExtractionTest.java     | 67 ++++++++++----------
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  4 +-
 .../microsoft/ooxml/SXSLFExtractorTest.java     |  2 +-
 5 files changed, 130 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 1ab8bd3..21577c4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -251,21 +251,57 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
             PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
             for (int i = 0; i < prc.size(); i++) {
                 PackagePart slidePart = mainDocument.getRelatedPart(prc.getRelationship(i));
-                for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
-                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                        parts.add(rel.getPackage().getPart(relName));
-                    }
-                }
-                parts.add(slidePart);
+                addSlideParts(slidePart, parts);
             }
         } catch (InvalidFormatException e) {
-            //do something
+            //log
         }
+
         parts.add(mainDocument);
+        for (String rel : new String[]{
+                XSLFRelation.SLIDE_MASTER.getRelation(),
+                HANDOUT_MASTER}) {
+            try {
+                PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackagePart pp = mainDocument.getRelatedPart(prc.getRelationship(i));
+                    if (pp != null) {
+                        parts.add(pp);
+                    }
+                }
+
+            } catch (InvalidFormatException e) {
+                //log
+            }
+        }
+
         return parts;
     }
 
+    private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+        //collect the internal parts this slide points to through its VML drawing, slide layout, notes master and notes relationships
+        for (String relation : new String[]{
+                XSLFRelation.VML_DRAWING.getRelation(),
+                XSLFRelation.SLIDE_LAYOUT.getRelation(),
+                XSLFRelation.NOTES_MASTER.getRelation(),
+                XSLFRelation.NOTES.getRelation()
+        }) {
+            try {
+                for (PackageRelationship packageRelationship : slidePart.getRelationshipsByType(relation)) {
+                    if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(packageRelationship.getTargetURI());
+                        parts.add(packageRelationship.getPackage().getPart(relName));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+                //best effort: give up on this relation type only; the loop carries on with the next type and the slide is still added below
+            }
+        }
+        //and slide of course
+        parts.add(slidePart);
+        //NOTE(review): the getPart(...) results above are added without a null check -- confirm consumers of 'parts' tolerate null entries
+    }
+
     private class XSLFCommentsHandler extends DefaultHandler {
 
         private String commentAuthorId = null;

http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 0bf0ad3..d2bacb3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -26,6 +26,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.sl.usermodel.Placeholder;
@@ -45,6 +46,10 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+    private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+
     public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
         super(context, extractor);
     }
@@ -267,24 +272,55 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
                 } catch (XmlException xe) {
                     throw new TikaException("Broken OOXML file", xe);
                 }
-                parts.add(slidePart);
+                addSlideParts(slidePart, parts);
+            }
+        }
+        //add full document to include macros
+        parts.add(document.getPackagePart());
 
-                // If it has drawings, return those too
-                try {
-                    for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
-                        if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                            PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                            parts.add(rel.getPackage().getPart(relName));
-                        }
+        for (String rel : new String[]{
+                XSLFRelation.SLIDE_MASTER.getRelation(),
+                HANDOUT_MASTER}) {
+            try {
+                PackageRelationshipCollection prc = document.getPackagePart().getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackagePart pp = document.getPackagePart().getRelatedPart(prc.getRelationship(i));
+                    if (pp != null) {
+                        parts.add(pp);
                     }
-                } catch (InvalidFormatException e) {
-                    throw new TikaException("Broken OOXML file", e);
                 }
+
+            } catch (InvalidFormatException e) {
+                //log
             }
         }
-        //add full document to include macros
-        parts.add(document.getPackagePart());
 
         return parts;
     }
+
+
+    private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+        //collect the internal parts this slide points to through its VML drawing, slide layout, notes master and notes relationships
+        for (String relation : new String[]{
+                XSLFRelation.VML_DRAWING.getRelation(),
+                XSLFRelation.SLIDE_LAYOUT.getRelation(),
+                XSLFRelation.NOTES_MASTER.getRelation(),
+                XSLFRelation.NOTES.getRelation()
+        }) {
+            try {
+                for (PackageRelationship packageRelationship : slidePart.getRelationshipsByType(relation)) {
+                    if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(packageRelationship.getTargetURI());
+                        parts.add(packageRelationship.getPackage().getPart(relName));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+                //best effort: give up on this relation type only; the loop carries on with the next type and the slide is still added below
+            }
+        }
+        //and slide of course
+        parts.add(slidePart);
+        //NOTE(review): the getPart(...) results above are added without a null check -- confirm consumers of 'parts' tolerate null entries
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 6c3c8d5..9cc300b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -170,21 +170,22 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
         assertEquals(23 + 1 /*thumbnail */, handler.filenames.size());
         assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size());
 
+
         assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside .pptx
-        assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   PNG inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside .pptx
-        assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); //   .xlsx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //     PNG inside .xlsx inside .pptx
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); //   .docx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); //     PNG inside .docx inside .pptx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); //     JPG inside .docx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); //     PNG inside .docx inside .pptx
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); //   .doc inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); //    PNG inside .doc inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); //   Icon of item inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); //   Icon of item inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); //   Icon of item inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4));  //   PNG inside .pptx
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(5));  //   GIF inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6));  //   PNG inside .pptx
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); //   .xlsx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); //     PNG inside .xlsx inside .pptx
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(9)); //   .docx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); //     PNG inside .docx inside .pptx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(11)); //     JPG inside .docx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); //     PNG inside .docx inside .pptx
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(13)); //   .doc inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(14)); //    PNG inside .doc inside .pptx
         assertEquals(TYPE_JPG, handler.mediaTypes.get(15));  // Embedded thumbnail
         assertEquals(TYPE_DOC, handler.mediaTypes.get(16));  // Embedded office doc
         assertEquals(TYPE_PNG, handler.mediaTypes.get(17));  //   PNG inside .doc
@@ -252,25 +253,27 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
         assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size());
 
         // We don't know their exact filenames
-        assertEquals("image4.png", handler.filenames.get(0));
-        assertEquals("image5.gif", handler.filenames.get(1));
-        assertEquals("image6.png", handler.filenames.get(2));
-        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3));
-        assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4));
-        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5));
-        assertEquals("image1.emf", handler.filenames.get(6));
-        assertEquals("image2.emf", handler.filenames.get(7));
-        assertEquals("image3.emf", handler.filenames.get(8));
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image2.emf", handler.filenames.get(1));
+        assertEquals("image3.emf", handler.filenames.get(2));
+        assertEquals("image4.png", handler.filenames.get(3));
+        assertEquals("image5.gif", handler.filenames.get(4));
+        assertEquals("image6.png", handler.filenames.get(5));
+        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(6));
+        assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(7));
+        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(8));
+        assertEquals("/docProps/thumbnail.jpeg", handler.filenames.get(9));
+
         // But we do know their types
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));  // Embedded image
-        assertEquals(TYPE_GIF, handler.mediaTypes.get(1));  // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));  // Embedded image
-        assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(5));  // Embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(7));  // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2));  // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  // Embedded image
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(4));  // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  // Embedded image
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(7)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(8));  // Embedded office doc
     }
 
     @Test

http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index e84f6d0..6464e79 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1235,9 +1235,9 @@ public class OOXMLParserTest extends TikaTest {
     @Test
     public void testEmbeddedPDFInPPTX() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx");
-        Metadata pdfMetadata1 = metadataList.get(2);
+        Metadata pdfMetadata1 = metadataList.get(4);
         assertContains("Apache Tika", pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
-        Metadata pdfMetadata2 = metadataList.get(4);
+        Metadata pdfMetadata2 = metadataList.get(5);
         assertContains("Hello World", pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index d8df4c9..305b2e4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -63,7 +63,7 @@ public class SXSLFExtractorTest extends TikaTest {
 
         List<Metadata> metadataList = getRecursiveMetadata("testPPT_various2.pptx", parseContext);
 
-        assertEquals("right number of attachments", 10, metadataList.size());
+        assertEquals("right number of attachments", 14, metadataList.size());
 
         String mainContent = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);