You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/19 21:06:22 UTC
tika git commit: TIKA-2218 -- add a few more places where .pptx can include embedded objects
Repository: tika
Updated Branches:
refs/heads/master 90cdf1f6a -> ca37313a7
TIKA-2218 -- add a few more places where .pptx can include embedded objects
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ca37313a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ca37313a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ca37313a
Branch: refs/heads/master
Commit: ca37313a716d4eaa3a15a4ba770f89ee23832e99
Parents: 90cdf1f
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 19 16:06:13 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 19 16:06:13 2016 -0500
----------------------------------------------------------------------
.../SXSLFPowerPointExtractorDecorator.java | 52 ++++++++++++---
.../ooxml/XSLFPowerPointExtractorDecorator.java | 60 ++++++++++++++----
.../ooxml/OOXMLContainerExtractionTest.java | 67 ++++++++++----------
.../parser/microsoft/ooxml/OOXMLParserTest.java | 4 +-
.../microsoft/ooxml/SXSLFExtractorTest.java | 2 +-
5 files changed, 130 insertions(+), 55 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 1ab8bd3..21577c4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -251,21 +251,57 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
for (int i = 0; i < prc.size(); i++) {
PackagePart slidePart = mainDocument.getRelatedPart(prc.getRelationship(i));
- for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
- if (rel.getTargetMode() == TargetMode.INTERNAL) {
- PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
- parts.add(rel.getPackage().getPart(relName));
- }
- }
- parts.add(slidePart);
+ addSlideParts(slidePart, parts);
}
} catch (InvalidFormatException e) {
- //do something
+ //log
}
+
parts.add(mainDocument);
+ for (String rel : new String[]{
+ XSLFRelation.SLIDE_MASTER.getRelation(),
+ HANDOUT_MASTER}) {
+ try {
+ PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(rel);
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart pp = mainDocument.getRelatedPart(prc.getRelationship(i));
+ if (pp != null) {
+ parts.add(pp);
+ }
+ }
+
+ } catch (InvalidFormatException e) {
+ //log
+ }
+ }
+
return parts;
}
+ private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+
+ for (String relation : new String[]{
+ XSLFRelation.VML_DRAWING.getRelation(),
+ XSLFRelation.SLIDE_LAYOUT.getRelation(),
+ XSLFRelation.NOTES_MASTER.getRelation(),
+ XSLFRelation.NOTES.getRelation()
+ }) {
+ try {
+ for (PackageRelationship packageRelationship : slidePart.getRelationshipsByType(relation)) {
+ if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(packageRelationship.getTargetURI());
+ parts.add(packageRelationship.getPackage().getPart(relName));
+ }
+ }
+ } catch (InvalidFormatException e) {
+ //log -- the exception is ignored and the loop moves on to the next relation type
+ }
+ }
+ //and slide of course
+ parts.add(slidePart);
+
+ }
+
private class XSLFCommentsHandler extends DefaultHandler {
private String commentAuthorId = null;
http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 0bf0ad3..d2bacb3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -26,6 +26,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.sl.usermodel.Placeholder;
@@ -45,6 +46,10 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+ private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+
public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
super(context, extractor);
}
@@ -267,24 +272,55 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
} catch (XmlException xe) {
throw new TikaException("Broken OOXML file", xe);
}
- parts.add(slidePart);
+ addSlideParts(slidePart, parts);
+ }
+ }
+ //add full document to include macros
+ parts.add(document.getPackagePart());
- // If it has drawings, return those too
- try {
- for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
- if (rel.getTargetMode() == TargetMode.INTERNAL) {
- PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
- parts.add(rel.getPackage().getPart(relName));
- }
+ for (String rel : new String[]{
+ XSLFRelation.SLIDE_MASTER.getRelation(),
+ HANDOUT_MASTER}) {
+ try {
+ PackageRelationshipCollection prc = document.getPackagePart().getRelationshipsByType(rel);
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart pp = document.getPackagePart().getRelatedPart(prc.getRelationship(i));
+ if (pp != null) {
+ parts.add(pp);
}
- } catch (InvalidFormatException e) {
- throw new TikaException("Broken OOXML file", e);
}
+
+ } catch (InvalidFormatException e) {
+ //log
}
}
- //add full document to include macros
- parts.add(document.getPackagePart());
return parts;
}
+
+
+ private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+
+ for (String relation : new String[]{
+ XSLFRelation.VML_DRAWING.getRelation(),
+ XSLFRelation.SLIDE_LAYOUT.getRelation(),
+ XSLFRelation.NOTES_MASTER.getRelation(),
+ XSLFRelation.NOTES.getRelation()
+ }) {
+ try {
+ for (PackageRelationship packageRelationship : slidePart.getRelationshipsByType(relation)) {
+ if (packageRelationship.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(packageRelationship.getTargetURI());
+ parts.add(packageRelationship.getPackage().getPart(relName));
+ }
+ }
+ } catch (InvalidFormatException e) {
+ //log -- the exception is ignored and the loop moves on to the next relation type
+ }
+ }
+ //and slide of course
+ parts.add(slidePart);
+
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 6c3c8d5..9cc300b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -170,21 +170,22 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
assertEquals(23 + 1 /*thumbnail */, handler.filenames.size());
assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size());
+
assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx
- assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // PNG inside .pptx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx
- assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); // .xlsx inside .pptx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .xlsx inside .pptx
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // .docx inside .pptx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx inside .pptx
- assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx inside .pptx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx inside .pptx
- assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // .doc inside .pptx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // PNG inside .doc inside .pptx
- assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); // Icon of item inside .pptx
- assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); // Icon of item inside .pptx
- assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of item inside .pptx
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of item inside .pptx
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of item inside .pptx
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of item inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // PNG inside .pptx
+ assertEquals(TYPE_GIF, handler.mediaTypes.get(5)); // GIF inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // PNG inside .pptx
+ assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // .xlsx inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // PNG inside .xlsx inside .pptx
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(9)); // .docx inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // PNG inside .docx inside .pptx
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(11)); // JPG inside .docx inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); // PNG inside .docx inside .pptx
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(13)); // .doc inside .pptx
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(14)); // PNG inside .doc inside .pptx
assertEquals(TYPE_JPG, handler.mediaTypes.get(15)); // Embedded thumbnail
assertEquals(TYPE_DOC, handler.mediaTypes.get(16)); // Embedded office doc
assertEquals(TYPE_PNG, handler.mediaTypes.get(17)); // PNG inside .doc
@@ -252,25 +253,27 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size());
// We don't know their exact filenames
- assertEquals("image4.png", handler.filenames.get(0));
- assertEquals("image5.gif", handler.filenames.get(1));
- assertEquals("image6.png", handler.filenames.get(2));
- assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3));
- assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4));
- assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5));
- assertEquals("image1.emf", handler.filenames.get(6));
- assertEquals("image2.emf", handler.filenames.get(7));
- assertEquals("image3.emf", handler.filenames.get(8));
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals("image2.emf", handler.filenames.get(1));
+ assertEquals("image3.emf", handler.filenames.get(2));
+ assertEquals("image4.png", handler.filenames.get(3));
+ assertEquals("image5.gif", handler.filenames.get(4));
+ assertEquals("image6.png", handler.filenames.get(5));
+ assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(6));
+ assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(7));
+ assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(8));
+ assertEquals("/docProps/thumbnail.jpeg", handler.filenames.get(9));
+
// But we do know their types
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); // Embedded image
- assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(5)); // Embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image
+ assertEquals(TYPE_GIF, handler.mediaTypes.get(4)); // Embedded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+ assertEquals(TYPE_XLSX, handler.mediaTypes.get(6)); // Embedded office doc
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(7)); // Embedded office doc
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(8)); // Embedded office doc
}
@Test
http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index e84f6d0..6464e79 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1235,9 +1235,9 @@ public class OOXMLParserTest extends TikaTest {
@Test
public void testEmbeddedPDFInPPTX() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx");
- Metadata pdfMetadata1 = metadataList.get(2);
+ Metadata pdfMetadata1 = metadataList.get(4);
assertContains("Apache Tika", pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
- Metadata pdfMetadata2 = metadataList.get(4);
+ Metadata pdfMetadata2 = metadataList.get(5);
assertContains("Hello World", pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ca37313a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index d8df4c9..305b2e4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -63,7 +63,7 @@ public class SXSLFExtractorTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testPPT_various2.pptx", parseContext);
- assertEquals("right number of attachments", 10, metadataList.size());
+ assertEquals("right number of attachments", 14, metadataList.size());
String mainContent = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);