You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:47 UTC
[4/7] tika git commit: TIKA-2191 -- step 4-- add markup for embedded
pics
TIKA-2191 -- step 4-- add markup for embedded pics
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/806eaf8b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/806eaf8b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/806eaf8b
Branch: refs/heads/master
Commit: 806eaf8b1802a3a3071a5ae0bdc35c20d6739280
Parents: 1aca10a
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 13:28:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:34 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 47 ++++++++++++--
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 66 +++++++++++++++++++-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 10 +++
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 38 +++++++++++
.../microsoft/ooxml/SXWPFExtractorTest.java | 44 ++++++++++++-
5 files changed, 193 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index ee88f15..8634cd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
@@ -29,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
@@ -56,10 +58,18 @@ import org.xml.sax.SAXException;
*/
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+ private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{ //relationship types whose ids loadLinkedRelationships() maps to the target's file name
+ RELATION_OLE_OBJECT,
+ RELATION_AUDIO,
+ RELATION_IMAGE,
+ RELATION_PACKAGE,
+ RELATION_OFFICE_DOCUMENT
+ };
private final OPCPackage opcPackage;
private final ParseContext context;
+
public SXWPFWordExtractorDecorator(ParseContext context,
XWPFEventBasedWordExtractor extractor) {
super(context, extractor);
@@ -135,22 +145,22 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private void handlePart(PackagePart packagePart,
XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
- Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+ Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart); //relationship id -> hyperlink url, or file name of an embedded target
try (InputStream stream = packagePart.getInputStream()) {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
new XWPFDocumentXMLBodyHandler(
new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
- context.get(OfficeParserConfig.class)), hyperlinks))));
+ context.get(OfficeParserConfig.class)), linkedRelationships))));
} catch (TikaException e) {
//NOTE(review): a TikaException raised inside the try is dropped without any signal to the caller -- confirm this best-effort behaviour is intended
}
}
- private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
- Map<String, String> hyperlinks = new HashMap<>();
+ private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
+ Map<String, String> linkedRelationships = new HashMap<>();
try {
PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
for (int i = 0; i < prc.size(); i++) {
@@ -161,12 +171,37 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
String id = pr.getId();
String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
if (id != null && url != null) {
- hyperlinks.put(id, url);
+ linkedRelationships.put(id, url);
+ }
+ }
+
+ for (String rel : EMBEDDED_RELATIONSHIPS) {
+ prc = bodyPart.getRelationshipsByType(rel);
+ for (int i = 0; i < prc.size(); i++) {
+ PackageRelationship pr = prc.getRelationship(i);
+ if (pr == null) {
+ continue;
+ }
+ String id = pr.getId();
+ String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+ String fileName = uriString;
+ if (pr.getTargetURI() != null) {
+ try {
+ fileName = FileHelper.getFilename(new File(fileName));
+ } catch (Exception e) {
+ fileName = uriString;
+ }
+ }
+ if (id != null) {
+ fileName = (fileName == null) ? "" : fileName;
+ linkedRelationships.put(id, fileName);
+ }
}
}
+
} catch (InvalidFormatException e) {
}
- return hyperlinks;
+ return linkedRelationships;
}
/*
private XWPFStyles loadStyles(PackagePart packagePart) {
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 9e5ce6b..2538215 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -45,6 +45,10 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+ private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+ private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+ private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+
private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
private final static char[] TAB = new char[1];
@@ -55,7 +59,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final XWPFBodyContentsHandler bodyContentsHandler;
//private final RelationshipsManager relationshipsManager;
- private final Map<String, String> hyperlinks;
+ private final Map<String, String> linkedRelationships;
private final StringBuilder runBuffer = new StringBuilder();
@@ -66,6 +70,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private boolean inNumPr = false;
private boolean inDelText = false;
+ private boolean inPic = false;
+ private String picDescription = null;
+ private String picRId = null;
+ private String picFilename = null;
+
//alternate content can be embedded in itself.
//need to track depth.
//if in alternate, choose fallback, maybe make this configurable?
@@ -78,7 +87,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
Map<String, String> hyperlinks) { //despite the parameter name, this map is stored as linkedRelationships and is also consulted for picture relationship ids
this.bodyContentsHandler = bodyContentsHandler;
- this.hyperlinks = hyperlinks;
+ this.linkedRelationships = hyperlinks;
}
@@ -111,6 +120,39 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
if (inACChoiceDepth > 0) {
return;
}
+ if (uri == null || uri.equals(O_NS)) {
+ if (localName.equals("OLEObject")) {
+ String type = null;
+ String refId = null;
+ //TODO: want to get ProgID?
+ for (int i = 0; i < atts.getLength(); i++) {
+ String attLocalName = atts.getLocalName(i);
+ String attValue = atts.getValue(i);
+ if (attLocalName.equals("Type")) {
+ type = attValue;
+ } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+ refId = attValue;
+ }
+ }
+ if ("Embed".equals(type)) {
+ bodyContentsHandler.embeddedOLERef(refId);
+ }
+ }
+ }
+
+ if (uri == null || uri.equals(PIC_NS)) {
+ if ("pic".equals(localName)) {
+ inPic = true;
+ } else if ("cNvPr".equals(localName)) {
+ picDescription = atts.getValue("", "descr");
+ }
+ }
+
+ if (uri == null || uri.equals(DRAWING_MAIN_NS)) {
+ if ("blip".equals(localName)) {
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+ }
+ }
if (uri == null || uri.equals(W_NS)) {
if (localName.equals("p")) {
@@ -151,7 +193,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
String hyperlink = null;
if (hyperlinkId != null) {
- hyperlink = hyperlinks.get(hyperlinkId);
+ hyperlink = linkedRelationships.get(hyperlinkId);
}
bodyContentsHandler.hyperlinkStart(hyperlink);
} else if (localName.equals("footnoteReference")) {
@@ -203,6 +245,20 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
inACFallbackDepth--;
}
}
+
+ if (PIC_NS.equals(uri)) {
+ if ("pic".equals(localName)) {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
+ }
if (uri == null || uri.equals(W_NS)) {
if (inACChoiceDepth > 0) {
return;
@@ -309,5 +365,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
void endnoteReference(String id);
boolean getIncludeMoveFromText();
+
+ void embeddedOLERef(String refId);
+
+ void embeddedPicRef(String picFileName, String picDescription);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 4ee7a4f..ee6bb85 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -353,6 +353,16 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
public boolean getIncludeMoveFromText() {
return false;
}
+
+ @Override
+ public void embeddedOLERef(String refId) {
+ //no-op: this handler produces no output for embedded OLE object references
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //no-op: this handler produces no output for embedded picture references
+ }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index d62e270..cd28583 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
@@ -248,6 +249,43 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
return includeMoveFromText;
}
+ @Override
+ public void embeddedOLERef(String relId) { //writes an empty <div class="embedded" id="relId"> placeholder for an embedded OLE object
+ if (relId == null) { //without a relationship id there is nothing to reference, so emit nothing
+ return;
+ }
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", relId);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is swallowed, so a failed write is invisible to the caller -- consider declaring "throws SAXException" on the interface method
+ }
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) { //writes an empty <img> element describing an embedded picture
+ //the <img> is emitted even when both arguments are null; it then carries no attributes
+ try {
+ AttributesImpl attr = new AttributesImpl();
+ if (picFileName != null) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName); //src is the file name prefixed with "embedded:"
+ }
+ if (picDescription != null) {
+ attr.addAttribute("", "alt", "alt", "CDATA", picDescription); //alt carries the picture's description
+ }
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is swallowed, so a failed write is invisible to the caller -- consider declaring "throws SAXException" on the interface method
+ }
+ }
+
private void closeStyleTags() throws SAXException {
if (isItalics) {
xhtml.endElement("i");
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 22e5644..f4a1aeb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
@@ -38,6 +39,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
@@ -418,13 +420,13 @@ public class SXWPFExtractorTest extends TikaTest {
// TIKA-989:
@Test
- @Ignore("TODO")
public void testEmbeddedPDF() throws Exception {
String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+ System.out.println(xml);
int i = xml.indexOf("Here is the pdf file:");
- int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\" />");
int k = xml.indexOf("Bye Bye");
- int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+ int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\" />");
int m = xml.indexOf("Bye for real.");
assertTrue(i != -1);
assertTrue(j != -1);
@@ -696,5 +698,41 @@ public class SXWPFExtractorTest extends TikaTest {
assertContainsAtLeast(minExpected, metadataList);//, parseContext));
}
+ @Test
+ public void testEmbedded() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embeded.docx", parseContext);
+ Metadata main = metadataList.get(0); //first entry is taken to be the container document -- TODO confirm against getRecursiveMetadata
+ String content = main.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //the container's content must carry the <img> markup for the picture and the <div> placeholder for the embedded object
+ assertContains("<img src=\"embedded:image2.jpeg\" alt=\"A description...\" />",
+ content);
+
+ assertContains("<div class=\"embedded\" id=\"rId8\" />",
+ content);
+ //total number of Metadata entries returned for this file
+ assertEquals(16, metadataList.size());
+ }
+
+ @Test
+ public void iterate() throws Exception { //NOTE(review): makes no assertions and only prints to stdout -- looks like leftover debugging scaffolding
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, EmptyParser.INSTANCE); //registers EmptyParser as the Parser in the context -- presumably so embedded documents are not parsed; confirm
+ for (File f : getResourceAsFile("/test-documents").listFiles()) {
+ if (! f.getName().equals("testWORD_embeded.docx")) { //only this single file is processed; every other test document is skipped
+ continue;
+ }
+ if (f.getName().endsWith("docx") || f.getName().endsWith(".docm")) {
+ try {
+ XMLResult r = getXML(f.getName(), context);
+ if (r.xml.contains("<img")) { //report files whose output contains picture markup
+ System.out.println(f.getName());
+ }
+ System.out.println(r.xml);
+ } catch (Exception e) {
+ e.printStackTrace(); //failures are printed rather than propagated, so a parse error cannot fail this test
+ }
+ }
+ }
+ }
}