You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/19 21:08:36 UTC

[1/2] tika git commit: TIKA-2218 -- add new locations within a pptx to check for embedded objects

Repository: tika
Updated Branches:
  refs/heads/2.x d8853fe31 -> ffb25af1b


 TIKA-2218 -- add new locations within a pptx to check for embedded objects


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4f04b6c3
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4f04b6c3
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4f04b6c3

Branch: refs/heads/2.x
Commit: 4f04b6c3e9645bfe5fdb7d7f1078051c0eca7fcc
Parents: 300100f
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 19 16:08:09 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 19 16:08:09 2016 -0500

----------------------------------------------------------------------
 .../ooxml/XSLFPowerPointExtractorDecorator.java | 60 ++++++++++++++----
 .../ooxml/OOXMLContainerExtractionTest.java     | 67 ++++++++++----------
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  4 +-
 3 files changed, 85 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/4f04b6c3/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 130a2f8..394c903 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -26,6 +26,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.sl.usermodel.Placeholder;
@@ -45,6 +46,10 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+    private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+
     public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
         super(context, extractor);
     }
@@ -267,24 +272,55 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
                 } catch (XmlException xe) {
                     throw new TikaException("Broken OOXML file", xe);
                 }
-                parts.add(slidePart);
+                addSlideParts(slidePart, parts);
+            }
+        }
+        //add full document to include macros
+        parts.add(document.getPackagePart());
 
-                // If it has drawings, return those too
-                try {
-                    for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
-                        if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                            PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                            parts.add(rel.getPackage().getPart(relName));
-                        }
+        for (String rel : new String[]{
+                XSLFRelation.SLIDE_MASTER.getRelation(),
+                HANDOUT_MASTER}) {
+            try {
+                PackageRelationshipCollection prc = document.getPackagePart().getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackagePart pp = document.getPackagePart().getRelatedPart(prc.getRelationship(i));
+                    if (pp != null) {
+                        parts.add(pp);
                     }
-                } catch (InvalidFormatException e) {
-                    throw new TikaException("Broken OOXML file", e);
                 }
+
+            } catch (InvalidFormatException e) {
+                //log
             }
         }
-        //add full document to include macros
-        parts.add(document.getPackagePart());
 
         return parts;
     }
+
+
+    private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+
+        for (String relation : new String[]{
+                XSLFRelation.VML_DRAWING.getRelation(),
+                XSLFRelation.SLIDE_LAYOUT.getRelation(),
+                XSLFRelation.NOTES_MASTER.getRelation(),
+                XSLFRelation.NOTES.getRelation()
+        }) {
+            try {
+                for (PackageRelationship packageRelationship : slidePart.getRelationshipsByType(relation)) {
+                    if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(packageRelationship.getTargetURI());
+                        parts.add(packageRelationship.getPackage().getPart(relName));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+
+            }
+        }
+        //and slide of course
+        parts.add(slidePart);
+
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/4f04b6c3/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 6c3c8d5..9cc300b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -170,21 +170,22 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
         assertEquals(23 + 1 /*thumbnail */, handler.filenames.size());
         assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size());
 
+
         assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1));  //   PNG inside .pptx
-        assertEquals(TYPE_GIF, handler.mediaTypes.get(2));  //   PNG inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //   PNG inside .pptx
-        assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); //   .xlsx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //     PNG inside .xlsx inside .pptx
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); //   .docx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); //     PNG inside .docx inside .pptx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); //     JPG inside .docx inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); //     PNG inside .docx inside .pptx
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); //   .doc inside .pptx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); //    PNG inside .doc inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); //   Icon of item inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); //   Icon of item inside .pptx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); //   Icon of item inside .pptx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); //   Icon of item inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4));  //   PNG inside .pptx
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(5));  //   PNG inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6));  //   PNG inside .pptx
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); //   .xlsx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); //     PNG inside .xlsx inside .pptx
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(9)); //   .docx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); //     PNG inside .docx inside .pptx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(11)); //     JPG inside .docx inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); //     PNG inside .docx inside .pptx
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(13)); //   .doc inside .pptx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(14)); //    PNG inside .doc inside .pptx
         assertEquals(TYPE_JPG, handler.mediaTypes.get(15));  // Embedded thumbnail
         assertEquals(TYPE_DOC, handler.mediaTypes.get(16));  // Embedded office doc
         assertEquals(TYPE_PNG, handler.mediaTypes.get(17));  //   PNG inside .doc
@@ -252,25 +253,27 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
         assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size());
 
         // We don't know their exact filenames
-        assertEquals("image4.png", handler.filenames.get(0));
-        assertEquals("image5.gif", handler.filenames.get(1));
-        assertEquals("image6.png", handler.filenames.get(2));
-        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3));
-        assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4));
-        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5));
-        assertEquals("image1.emf", handler.filenames.get(6));
-        assertEquals("image2.emf", handler.filenames.get(7));
-        assertEquals("image3.emf", handler.filenames.get(8));
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image2.emf", handler.filenames.get(1));
+        assertEquals("image3.emf", handler.filenames.get(2));
+        assertEquals("image4.png", handler.filenames.get(3));
+        assertEquals("image5.gif", handler.filenames.get(4));
+        assertEquals("image6.png", handler.filenames.get(5));
+        assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(6));
+        assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(7));
+        assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(8));
+        assertEquals("/docProps/thumbnail.jpeg", handler.filenames.get(9));
+
         // But we do know their types
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));  // Embedded image
-        assertEquals(TYPE_GIF, handler.mediaTypes.get(1));  // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));  // Embedded image
-        assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(5));  // Embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(6));  // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(7));  // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1));  // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2));  // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  // Embedded image
+        assertEquals(TYPE_GIF, handler.mediaTypes.get(4));  // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  // Embedded image
+        assertEquals(TYPE_XLSX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(7)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(8));  // Embedded office doc
     }
 
     @Test

http://git-wip-us.apache.org/repos/asf/tika/blob/4f04b6c3/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ea936d8..e56462c 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1220,9 +1220,9 @@ public class OOXMLParserTest extends TikaTest {
     @Test
     public void testEmbeddedPDFInPPTX() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testPPT_embeddedPDF.pptx");
-        Metadata pdfMetadata1 = metadataList.get(2);
+        Metadata pdfMetadata1 = metadataList.get(4);
         assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
-        Metadata pdfMetadata2 = metadataList.get(4);
+        Metadata pdfMetadata2 = metadataList.get(6);
         assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
     }
 


[2/2] tika git commit: Merge remote-tracking branch 'origin/2.x' into 2.x

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/2.x' into 2.x


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ffb25af1
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ffb25af1
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ffb25af1

Branch: refs/heads/2.x
Commit: ffb25af1b12d304908171c9be710e2e5c8e2ffd2
Parents: 4f04b6c d8853fe
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 19 16:08:29 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 19 16:08:29 2016 -0500

----------------------------------------------------------------------
 tika-parser-modules/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------