You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/08 23:58:11 UTC

[tika] branch main updated: TIKA-3156: Added ability to read hyperlinked images from ODT files

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2b45667  TIKA-3156: Added ability to read hyperlinked images from ODT files
2b45667 is described below

commit 2b456679200bd8b2e86864ae6db847923d2bc134
Author: David Meikle <dm...@apache.org>
AuthorDate: Sun Nov 8 23:23:06 2020 +0000

    TIKA-3156: Added ability to read hyperlinked images from ODT files
---
 CHANGES.txt                                        |   2 ++
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  14 +++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  23 ++++++++++++++-------
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   8 +++++++
 .../test-documents/testODTEmbeddedImageLink.odt    | Bin 0 -> 32873 bytes
 5 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4dca553..186f647 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -38,6 +38,8 @@ Release 1.25 - ???
 
    * Add a basic parser for plist files based on com.googlecode.plist:dd-plist (TIKA-3104).
 
+   * Read hyperlinked images from ODT files (TIKA-3156).
+
 Release 1.24.1 - 4/17/2020
 
    * Allow gzip compression of input and output streams for tika-server (TIKA-3073).
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 9d18f64..0aba06e 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ElementMappingContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -159,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
         MAPPINGS.put(
                 new QName(TEXT_NS, "a"),
                 new TargetElement(XHTML, "a", aAttsMapping));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "a"),
+                new TargetElement(XHTML, "a", aAttsMapping));
 
         // create HTML tables from table:-tags
         MAPPINGS.put(
@@ -431,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
             String namespaceURI, String localName, String qName,
             Attributes attrs) throws SAXException {
 
+        if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+            String link = attrs.getValue(XLINK_NS, "href");
+            AttributesImpl attr = new AttributesImpl();
+            if (!StringUtils.isEmpty(link)) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+            }
+            handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+            handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+        }
+
         if (BINARY_DATA.equals(localName)) {
             inBinaryData = true;
             return;
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 451adae..e7e93c5 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -30,9 +30,7 @@ import java.util.zip.ZipFile;
 import java.util.zip.ZipInputStream;
 
 import org.apache.tika.config.Field;
-import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.IOUtils;
@@ -100,6 +98,8 @@ public class OpenDocumentParser extends AbstractParser {
 
     private static final String META_NAME = "meta.xml";
 
+    private EmbeddedDocumentUtil embeddedDocumentUtil;
+
     private Parser meta = new OpenDocumentMetaParser();
 
     private Parser content = new OpenDocumentContentParser();
@@ -131,6 +131,8 @@ public class OpenDocumentParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
+        embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+
         // Open the Zip stream
         // Use a File if we can, and an already open zip is even better
         ZipFile zipFile = null;
@@ -244,21 +246,28 @@ public class OpenDocumentParser extends AbstractParser {
             if (embeddedName.contains("Thumbnails/") ||
                     embeddedName.contains("Pictures/")) {
 
-                EmbeddedDocumentExtractor embeddedDocumentExtractor =
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                 Metadata embeddedMetadata = new Metadata();
-                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+                TikaInputStream stream = TikaInputStream.get(zip);
+
+                embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName());
                 if (embeddedName.startsWith("Thumbnails/")) {
                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
                 }
+
                 if (embeddedName.contains("Pictures/")) {
                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+                    MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+                    if (embeddedMimeType != null) {
+                        embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+                    }
+                    stream.reset();
                 }
 
-                if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-                    embeddedDocumentExtractor.parseEmbedded(zip,
+                if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+                    embeddedDocumentUtil.parseEmbedded(stream,
                             new EmbeddedContentHandler(handler), embeddedMetadata, false);
                 }
             } else if (extractMacros && embeddedName.contains("Basic/")) {
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index f8777af..54eed2b 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -374,6 +374,14 @@ public class ODFParserTest extends TikaTest {
         assertEquals(3, metadataList.size());
     }
 
+    @Test
+    public void testEmbeddedImageAndLink() throws Exception {
+        String xml = getXML("testODTEmbeddedImageLink.odt").xml;
+        assertContains("<a href=\"https://tika.apache.org/\">" +
+                "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
+                "<span>Visit Tika</span></a>", xml);
+    }
+
     @Test(expected = IOException.class)
     public void testInvalidFromStream() throws Exception {
         try (InputStream is = this.getClass().getResource(
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt
new file mode 100644
index 0000000..88970f7
Binary files /dev/null and b/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ