You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:48 UTC
[5/7] tika git commit: TIKA-2191 -- step 5 actually extract images
embedded in areas besides the body of docx/m
TIKA-2191 -- step 5 actually extract images embedded in areas besides the body of docx/m
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4469ca2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4469ca2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4469ca2c
Branch: refs/heads/master
Commit: 4469ca2c4ea725e9f5d94c116aaf248deea2a6eb
Parents: 806eaf8
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 08:43:59 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:48 2016 -0500
----------------------------------------------------------------------
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +++-
.../ooxml/SXWPFWordExtractorDecorator.java | 47 ++++++++++++++--
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 32 ++++++++---
.../parser/microsoft/ooxml/OOXMLParserTest.java | 54 ++++++++++++-------
.../microsoft/ooxml/SXWPFExtractorTest.java | 35 ++++++++++--
.../test-documents/testWORD_embedded_pics.docx | Bin 0 -> 52399 bytes
6 files changed, 142 insertions(+), 37 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index f9ba8a6..6bc867d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -22,7 +22,9 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
@@ -158,10 +160,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private void handleEmbeddedParts(ContentHandler handler)
throws TikaException, IOException, SAXException {
+ Set<String> seen = new HashSet<>();
try {
for (PackagePart source : getMainDocumentParts()) {
for (PackageRelationship rel : source.getRelationships()) {
-
+ URI targetURI = rel.getTargetURI();
+ if (targetURI != null) {
+ if (seen.contains(targetURI.toString())) {
+ continue;
+ }
+ seen.add(targetURI.toString());
+ }
URI sourceURI = rel.getSourceURI();
String sourceDesc;
if (sourceURI != null) {
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 8634cd6..43fca3b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -59,13 +60,21 @@ import org.xml.sax.SAXException;
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
- RELATION_OLE_OBJECT,
RELATION_AUDIO,
RELATION_IMAGE,
RELATION_PACKAGE,
RELATION_OFFICE_DOCUMENT
};
+ //include all parts that might have embedded objects
+ private final static String[] MAIN_PART_RELATIONS = new String[]{
+ XWPFRelation.HEADER.getRelation(),
+ XWPFRelation.FOOTER.getRelation(),
+ XWPFRelation.FOOTNOTE.getRelation(),
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+ };
+
private final OPCPackage opcPackage;
private final ParseContext context;
@@ -82,7 +91,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
//handle main document
- List<PackagePart> pps = getMainDocumentParts();
+ List<PackagePart> pps = getStoryDocumentParts();
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -176,6 +185,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
for (String rel : EMBEDDED_RELATIONSHIPS) {
+
prc = bodyPart.getRelationshipsByType(rel);
for (int i = 0; i < prc.size(); i++) {
PackageRelationship pr = prc.getRelationship(i);
@@ -247,11 +257,40 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * This returns the main document only.
+ * This returns all items that might contain embedded objects:
+ * main document, headers, footers, comments, etc.
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
- //figure out which one this is
+
+ List<PackagePart> mainStoryDocs = getStoryDocumentParts();
+ List<PackagePart> relatedParts = new ArrayList<>();
+
+ for (PackagePart pp : mainStoryDocs) {
+ addRelatedParts(pp, relatedParts);
+ }
+ relatedParts.addAll(mainStoryDocs);
+ return relatedParts;
+ }
+
+ private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+ for (String relation : MAIN_PART_RELATIONS) {
+ PackageRelationshipCollection prc = null;
+ try {
+ prc = documentPart.getRelationshipsByType(relation);
+ if (prc != null) {
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+ relatedParts.add(packagePart);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ }
+ }
+
+ }
+
+ private List<PackagePart> getStoryDocumentParts() {
List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
if (pps.size() == 0) {
pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 2538215..b2e74d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -34,6 +34,7 @@ import org.xml.sax.helpers.DefaultHandler;
public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+
enum EditType {
NONE,
INSERT,
@@ -48,6 +49,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final static String O_NS = "urn:schemas-microsoft-com:office:office";
private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+ private static final String V_NS = "urn:schemas-microsoft-com:vml";
private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
@@ -71,6 +73,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private boolean inDelText = false;
private boolean inPic = false;
+ private boolean inPict = false;
private String picDescription = null;
private String picRId = null;
private String picFilename = null;
@@ -154,6 +157,13 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
}
}
+ if (uri == null || uri.equals(V_NS)) {
+ if ("imagedata".equals(localName)) {
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ picDescription = atts.getValue(O_NS, "title");
+ }
+ }
+
if (uri == null || uri.equals(W_NS)) {
if (localName.equals("p")) {
bodyContentsHandler.startParagraph();
@@ -248,14 +258,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
if (PIC_NS.equals(uri)) {
if ("pic".equals(localName)) {
- String picFileName = null;
- if (picRId != null) {
- picFileName = linkedRelationships.get(picRId);
- }
- bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
- picDescription = null;
- picRId = null;
+ handlePict();
inPic = false;
+ return;
}
}
@@ -291,10 +296,23 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
editType = EditType.NONE;
} else if (localName.equals("hyperlink")) {
bodyContentsHandler.hyperlinkEnd();
+ } else if ("pict".equals(localName)) {
+ handlePict();
}
}
}
+ private void handlePict() {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0059d09..a831006 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,30 +468,44 @@ public class OOXMLParserTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
+ @Ignore("fix actual extraction")
public void testWordPicturesInHeader() throws Exception {
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
+ List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
+ assertEquals(2, metadataList.size());
+ Metadata m = metadataList.get(0);
+ String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ m.get(Metadata.CONTENT_TYPE));
+ // Check that custom headings came through
+ assertTrue(mainContent.contains("<img"));
+ }
- // Try with a document containing various tables and formattings
- try (InputStream input = getTestDocument("headerPic.docx")) {
- parser.parse(input, handler, metadata, context);
- String xml = sw.toString();
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- // Check that custom headings came through
- assertTrue(xml.contains("<img"));
+ @Test
+ @Ignore("not currently extracting from non-body components")
+ public void testPicturesInVariousPlaces() throws Exception {
+ //test that images are actually extracted from
+ //headers, footers, comments, endnotes, footnotes
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx");
+
+ //only process embedded resources once
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ for (int i = 1; i < 4; i++) {
+ assertContains("header"+i+"_pic", content);
+ assertContains("footer"+i+"_pic", content);
}
+ assertContains("body_pic.jpg", content);
+ assertContains("sdt_pic.jpg", content);
+ assertContains("deeply_embedded_pic", content);
+ assertContains("deleted_pic", content);//TODO: don't extract this
+ assertContains("footnotes_pic", content);
+ assertContains("comments_pic", content);
+ assertContains("endnotes_pic", content);
+// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+ assertContainsCount("<img src=", content, 14);
}
-
/**
* Documents with some sheets are protected, but not all.
* See TIKA-364.
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index f4a1aeb..6064be2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -290,16 +290,41 @@ public class SXWPFExtractorTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
- @Ignore("TODO")
public void testWordPicturesInHeader() throws Exception {
- assertEquals(2, getRecursiveMetadata("headerPic.docx").size());
- XMLResult xmlResult = getXML("headerPic.docx", parseContext);
+ List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx", parseContext);
+ assertEquals(2, metadataList.size());
+ Metadata m = metadataList.get(0);
+ String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
assertEquals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ m.get(Metadata.CONTENT_TYPE));
// Check that custom headings came through
- assertTrue(xmlResult.xml.contains("<img"));
+ assertTrue(mainContent.contains("<img"));
+ }
+ @Test
+ public void testPicturesInVariousPlaces() throws Exception {
+ //test that images are actually extracted from
+ //headers, footers, comments, endnotes, footnotes
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx", parseContext);
+
+ //only process embedded resources once
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ for (int i = 1; i < 4; i++) {
+ assertContains("header"+i+"_pic", content);
+ assertContains("footer"+i+"_pic", content);
+ }
+ assertContains("body_pic.jpg", content);
+ assertContains("sdt_pic.jpg", content);
+ assertContains("deeply_embedded_pic", content);
+ assertContains("deleted_pic", content);//TODO: don't extract this
+ assertContains("footnotes_pic", content);
+ assertContains("comments_pic", content);
+ assertContains("endnotes_pic", content);
+// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+ assertContainsCount("<img src=", content, 14);
}
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
new file mode 100644
index 0000000..1a63e6f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx differ