You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain text copy.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/09 09:36:05 UTC

[tika] 02/02: TIKA-3156: Added ability to read hyperlinked images from ODT files

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 38d226801725ce3742bbc29ca62400cee115927a
Author: David Meikle <dm...@apache.org>
AuthorDate: Sun Nov 8 23:23:06 2020 +0000

    TIKA-3156: Added ability to read hyperlinked images from ODT files
---
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  13 ++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  23 ++++++++++++++-------
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   8 +++++++
 .../test-documents/testODTEmbeddedImageLink.odt    | Bin 0 -> 32873 bytes
 4 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 0349c7d..104f510 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -160,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
         MAPPINGS.put(
                 new QName(TEXT_NS, "a"),
                 new TargetElement(XHTML, "a", aAttsMapping));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "a"),
+                new TargetElement(XHTML, "a", aAttsMapping));
 
         // create HTML tables from table:-tags
         MAPPINGS.put(
@@ -432,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
             String namespaceURI, String localName, String qName,
             Attributes attrs) throws SAXException {
 
+        if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+            String link = attrs.getValue(XLINK_NS, "href");
+            AttributesImpl attr = new AttributesImpl();
+            if (!StringUtils.isEmpty(link)) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+            }
+            handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+            handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+        }
+
         if (BINARY_DATA.equals(localName)) {
             inBinaryData = true;
             return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index b408ccf..851d3b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -32,9 +32,7 @@ import java.util.zip.ZipInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.config.Field;
-import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -101,6 +99,8 @@ public class OpenDocumentParser extends AbstractParser {
 
     private static final String META_NAME = "meta.xml";
 
+    private EmbeddedDocumentUtil embeddedDocumentUtil;
+
     private Parser meta = new OpenDocumentMetaParser();
 
     private Parser content = new OpenDocumentContentParser();
@@ -132,6 +132,8 @@ public class OpenDocumentParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
+        embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+
         // Open the Zip stream
         // Use a File if we can, and an already open zip is even better
         ZipFile zipFile = null;
@@ -245,21 +247,28 @@ public class OpenDocumentParser extends AbstractParser {
             if (embeddedName.contains("Thumbnails/") ||
                     embeddedName.contains("Pictures/")) {
 
-                EmbeddedDocumentExtractor embeddedDocumentExtractor =
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                 Metadata embeddedMetadata = new Metadata();
-                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+                TikaInputStream stream = TikaInputStream.get(zip);
+
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
                 if (embeddedName.startsWith("Thumbnails/")) {
                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
                 }
+
                 if (embeddedName.contains("Pictures/")) {
                     embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+                    MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+                    if (embeddedMimeType != null) {
+                        embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+                    }
+                    stream.reset();
                 }
 
-                if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-                    embeddedDocumentExtractor.parseEmbedded(zip,
+                if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+                    embeddedDocumentUtil.parseEmbedded(stream,
                             new EmbeddedContentHandler(handler), embeddedMetadata, false);
                 }
             } else if (extractMacros && embeddedName.contains("Basic/")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 719aae5..0b0e2ad 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -421,6 +421,14 @@ public class ODFParserTest extends TikaTest {
         assertEquals(3, metadataList.size());
     }
 
+    @Test
+    public void testEmbeddedImageAndLink() throws Exception {
+        String xml = getXML("testODTEmbeddedImageLink.odt").xml;
+        assertContains("<a href=\"https://tika.apache.org/\">" +
+                "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
+                "<span>Visit Tika</span></a>", xml);
+    }
+
     @Test(expected = IOException.class)
     public void testInvalidFromStream() throws Exception {
         try (InputStream is = this.getClass().getResource(
diff --git a/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt
new file mode 100644
index 0000000..88970f7
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ