You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:47 UTC

[4/7] tika git commit: TIKA-2191 -- step 4 -- add markup for embedded pics

TIKA-2191 -- step 4 -- add markup for embedded pics


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/806eaf8b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/806eaf8b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/806eaf8b

Branch: refs/heads/master
Commit: 806eaf8b1802a3a3071a5ae0bdc35c20d6739280
Parents: 1aca10a
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 13:28:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:34 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      | 47 ++++++++++++--
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  | 66 +++++++++++++++++++-
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 10 +++
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     | 38 +++++++++++
 .../microsoft/ooxml/SXWPFExtractorTest.java     | 44 ++++++++++++-
 5 files changed, 193 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index ee88f15..8634cd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
@@ -29,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
@@ -56,10 +58,18 @@ import org.xml.sax.SAXException;
  */
 public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
+    private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
+            RELATION_OLE_OBJECT,
+            RELATION_AUDIO,
+            RELATION_IMAGE,
+            RELATION_PACKAGE,
+            RELATION_OFFICE_DOCUMENT
+    };
 
     private final OPCPackage opcPackage;
     private final ParseContext context;
 
+
     public SXWPFWordExtractorDecorator(ParseContext context,
                                        XWPFEventBasedWordExtractor extractor) {
         super(context, extractor);
@@ -135,22 +145,22 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     private void handlePart(PackagePart packagePart,
                             XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
 
-        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
         try (InputStream stream = packagePart.getInputStream()) {
             context.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
                             new XWPFDocumentXMLBodyHandler(
                                     new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
-                                            context.get(OfficeParserConfig.class)), hyperlinks))));
+                                            context.get(OfficeParserConfig.class)), linkedRelationships))));
         } catch (TikaException e) {
             //swallow
         }
 
     }
 
-    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
-        Map<String, String> hyperlinks = new HashMap<>();
+    private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
+        Map<String, String> linkedRelationships = new HashMap<>();
         try {
             PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
             for (int i = 0; i < prc.size(); i++) {
@@ -161,12 +171,37 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                 String id = pr.getId();
                 String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
                 if (id != null && url != null) {
-                    hyperlinks.put(id, url);
+                    linkedRelationships.put(id, url);
+                }
+            }
+
+            for (String rel : EMBEDDED_RELATIONSHIPS) {
+                prc = bodyPart.getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackageRelationship pr = prc.getRelationship(i);
+                    if (pr == null) {
+                        continue;
+                    }
+                    String id = pr.getId();
+                    String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                    String fileName = uriString;
+                    if (pr.getTargetURI() != null) {
+                        try {
+                            fileName = FileHelper.getFilename(new File(fileName));
+                        } catch (Exception e) {
+                            fileName = uriString;
+                        }
+                    }
+                    if (id != null) {
+                        fileName = (fileName == null) ? "" : fileName;
+                        linkedRelationships.put(id, fileName);
+                    }
                 }
             }
+
         } catch (InvalidFormatException e) {
         }
-        return hyperlinks;
+        return linkedRelationships;
     }
 /*
     private XWPFStyles loadStyles(PackagePart packagePart) {

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 9e5ce6b..2538215 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -45,6 +45,10 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
     private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+    private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+    private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+
     private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
 
     private final static char[] TAB = new char[1];
@@ -55,7 +59,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
     private final XWPFBodyContentsHandler bodyContentsHandler;
     //private final RelationshipsManager relationshipsManager;
-    private final Map<String, String> hyperlinks;
+    private final Map<String, String> linkedRelationships;
 
     private final StringBuilder runBuffer = new StringBuilder();
 
@@ -66,6 +70,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private boolean inNumPr = false;
     private boolean inDelText = false;
 
+    private boolean inPic = false;
+    private String picDescription = null;
+    private String picRId = null;
+    private String picFilename = null;
+
     //alternate content can be embedded in itself.
     //need to track depth.
     //if in alternate, choose fallback, maybe make this configurable?
@@ -78,7 +87,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
                                       Map<String, String> hyperlinks) {
         this.bodyContentsHandler = bodyContentsHandler;
-        this.hyperlinks = hyperlinks;
+        this.linkedRelationships = hyperlinks;
     }
 
 
@@ -111,6 +120,39 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
         if (inACChoiceDepth > 0) {
             return;
         }
+        if (uri == null || uri.equals(O_NS)) {
+            if (localName.equals("OLEObject")) {
+                String type = null;
+                String refId = null;
+                //TODO: want to get ProgID?
+                for (int i = 0; i < atts.getLength(); i++) {
+                    String attLocalName = atts.getLocalName(i);
+                    String attValue = atts.getValue(i);
+                    if (attLocalName.equals("Type")) {
+                        type = attValue;
+                    } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+                        refId = attValue;
+                    }
+                }
+                if ("Embed".equals(type)) {
+                    bodyContentsHandler.embeddedOLERef(refId);
+                }
+            }
+        }
+
+        if (uri == null || uri.equals(PIC_NS)) {
+            if ("pic".equals(localName)) {
+                inPic = true;
+            } else if ("cNvPr".equals(localName)) {
+                picDescription = atts.getValue("", "descr");
+            }
+        }
+
+        if (uri == null || uri.equals(DRAWING_MAIN_NS)) {
+            if ("blip".equals(localName)) {
+                picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+            }
+        }
 
         if (uri == null || uri.equals(W_NS)) {
             if (localName.equals("p")) {
@@ -151,7 +193,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
                 String hyperlink = null;
                 if (hyperlinkId != null) {
-                    hyperlink = hyperlinks.get(hyperlinkId);
+                    hyperlink = linkedRelationships.get(hyperlinkId);
                 }
                 bodyContentsHandler.hyperlinkStart(hyperlink);
             } else if (localName.equals("footnoteReference")) {
@@ -203,6 +245,20 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 inACFallbackDepth--;
             }
         }
+
+        if (PIC_NS.equals(uri)) {
+            if ("pic".equals(localName)) {
+                String picFileName = null;
+                if (picRId != null) {
+                    picFileName = linkedRelationships.get(picRId);
+                }
+                bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+                picDescription = null;
+                picRId = null;
+                inPic = false;
+            }
+
+        }
         if (uri == null || uri.equals(W_NS)) {
             if (inACChoiceDepth > 0) {
                 return;
@@ -309,5 +365,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
         void endnoteReference(String id);
 
         boolean getIncludeMoveFromText();
+
+        void embeddedOLERef(String refId);
+
+        void embeddedPicRef(String picFileName, String picDescription);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 4ee7a4f..ee6bb85 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -353,6 +353,16 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         public boolean getIncludeMoveFromText() {
             return false;
         }
+
+        @Override
+        public void embeddedOLERef(String refId) {
+            //no-op
+        }
+
+        @Override
+        public void embeddedPicRef(String picFileName, String picDescription) {
+            //no-op
+        }
     }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index d62e270..cd28583 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
 
@@ -248,6 +249,43 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
         return includeMoveFromText;
     }
 
+    @Override
+    public void embeddedOLERef(String relId) {
+        if (relId == null) {
+            return;
+        }
+        try {
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", relId);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+
+        } catch (SAXException e) {
+
+        }
+    }
+
+    @Override
+    public void embeddedPicRef(String picFileName, String picDescription) {
+
+        try {
+            AttributesImpl attr = new AttributesImpl();
+            if (picFileName != null) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+            }
+            if (picDescription != null) {
+                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+            }
+
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+
+        } catch (SAXException e) {
+
+        }
+    }
+
     private void closeStyleTags() throws SAXException {
         if (isItalics) {
             xhtml.endElement("i");

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 22e5644..f4a1aeb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.util.Arrays;
@@ -38,6 +39,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
@@ -418,13 +420,13 @@ public class SXWPFExtractorTest extends TikaTest {
 
     // TIKA-989:
     @Test
-    @Ignore("TODO")
     public void testEmbeddedPDF() throws Exception {
         String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+        System.out.println(xml);
         int i = xml.indexOf("Here is the pdf file:");
-        int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\" />");
         int k = xml.indexOf("Bye Bye");
-        int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+        int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\" />");
         int m = xml.indexOf("Bye for real.");
         assertTrue(i != -1);
         assertTrue(j != -1);
@@ -696,5 +698,41 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContainsAtLeast(minExpected, metadataList);//, parseContext));
     }
 
+    @Test
+    public void testEmbedded() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_embeded.docx", parseContext);
+        Metadata main = metadataList.get(0);
+        String content = main.get(RecursiveParserWrapper.TIKA_CONTENT);
+        //make sure mark up is there
+        assertContains("<img src=\"embedded:image2.jpeg\" alt=\"A description...\" />",
+                content);
+
+        assertContains("<div class=\"embedded\" id=\"rId8\" />",
+                content);
+
+        assertEquals(16, metadataList.size());
+    }
+
+    @Test
+    public void iterate() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, EmptyParser.INSTANCE);
+        for (File f : getResourceAsFile("/test-documents").listFiles()) {
+            if (! f.getName().equals("testWORD_embeded.docx")) {
+                continue;
+            }
+            if (f.getName().endsWith("docx") || f.getName().endsWith(".docm")) {
+                try {
+                    XMLResult r = getXML(f.getName(), context);
+                    if (r.xml.contains("<img")) {
+                        System.out.println(f.getName());
+                    }
+                    System.out.println(r.xml);
+                } catch (Exception e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
 
 }