You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/09 09:36:03 UTC

[tika] branch branch_1x updated (809d6bb -> 38d2268)

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 809d6bb  bump maven bundle to 5.1.1 for compatibility with Java 15
     new 4778ede  TIKA-3156: Added ability to read hyperlinked images from ODT files
     new 38d2268  TIKA-3156: Added ability to read hyperlinked images from ODT files

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  14 ++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  25 +++++++++++++--------
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   8 +++++++
 .../test-documents/testODTEmbeddedImageLink.odt    | Bin 0 -> 32873 bytes
 4 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt


[tika] 01/02: TIKA-3156: Added ability to read hyperlinked images from ODT files

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4778eded858bdbc23ac6156085eb19e13e8a77cf
Author: David Meikle <dm...@apache.org>
AuthorDate: Mon Nov 9 09:28:26 2020 +0000

    TIKA-3156: Added ability to read hyperlinked images from ODT files
---
 .../main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java   | 1 +
 .../src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java    | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 9d18f64..0349c7d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.odf;
 
 import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index a784aad..b408ccf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -26,13 +26,11 @@ import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.zip.ZipEntry;
-import java.util.zip.ZipException;
 import java.util.zip.ZipFile;
 import java.util.zip.ZipInputStream;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.exception.TikaException;


[tika] 02/02: TIKA-3156: Added ability to read hyperlinked images from ODT files

Posted by dm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 38d226801725ce3742bbc29ca62400cee115927a
Author: David Meikle <dm...@apache.org>
AuthorDate: Sun Nov 8 23:23:06 2020 +0000

    TIKA-3156: Added ability to read hyperlinked images from ODT files
---
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  13 ++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  23 ++++++++++++++-------
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   8 +++++++
 .../test-documents/testODTEmbeddedImageLink.odt    | Bin 0 -> 32873 bytes
 4 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 0349c7d..104f510 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -160,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
         MAPPINGS.put(
                 new QName(TEXT_NS, "a"),
                 new TargetElement(XHTML, "a", aAttsMapping));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "a"),
+                new TargetElement(XHTML, "a", aAttsMapping));
 
         // create HTML tables from table:-tags
         MAPPINGS.put(
@@ -432,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
             String namespaceURI, String localName, String qName,
             Attributes attrs) throws SAXException {
 
+        if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+            String link = attrs.getValue(XLINK_NS, "href");
+            AttributesImpl attr = new AttributesImpl();
+            if (!StringUtils.isEmpty(link)) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+            }
+            handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+            handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+        }
+
         if (BINARY_DATA.equals(localName)) {
             inBinaryData = true;
             return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index b408ccf..851d3b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -32,9 +32,7 @@ import java.util.zip.ZipInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.config.Field;
-import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -101,6 +99,8 @@ public class OpenDocumentParser extends AbstractParser {
 
     private static final String META_NAME = "meta.xml";
 
+    private EmbeddedDocumentUtil embeddedDocumentUtil;
+
     private Parser meta = new OpenDocumentMetaParser();
 
     private Parser content = new OpenDocumentContentParser();
@@ -132,6 +132,8 @@ public class OpenDocumentParser extends AbstractParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
+        embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+
         // Open the Zip stream
         // Use a File if we can, and an already open zip is even better
         ZipFile zipFile = null;
@@ -245,21 +247,28 @@ public class OpenDocumentParser extends AbstractParser {
             if (embeddedName.contains("Thumbnails/") ||
                     embeddedName.contains("Pictures/")) {
 
-                EmbeddedDocumentExtractor embeddedDocumentExtractor =
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                 Metadata embeddedMetadata = new Metadata();
-                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+                TikaInputStream stream = TikaInputStream.get(zip);
+
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
                 if (embeddedName.startsWith("Thumbnails/")) {
                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
                 }
+
                 if (embeddedName.contains("Pictures/")) {
                     embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+                    MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+                    if (embeddedMimeType != null) {
+                        embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+                    }
+                    stream.reset();
                 }
 
-                if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-                    embeddedDocumentExtractor.parseEmbedded(zip,
+                if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+                    embeddedDocumentUtil.parseEmbedded(stream,
                             new EmbeddedContentHandler(handler), embeddedMetadata, false);
                 }
             } else if (extractMacros && embeddedName.contains("Basic/")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 719aae5..0b0e2ad 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -421,6 +421,14 @@ public class ODFParserTest extends TikaTest {
         assertEquals(3, metadataList.size());
     }
 
+    @Test
+    public void testEmbeddedImageAndLink() throws Exception {
+        String xml = getXML("testODTEmbeddedImageLink.odt").xml;
+        assertContains("<a href=\"https://tika.apache.org/\">" +
+                "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
+                "<span>Visit Tika</span></a>", xml);
+    }
+
     @Test(expected = IOException.class)
     public void testInvalidFromStream() throws Exception {
         try (InputStream is = this.getClass().getResource(
diff --git a/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt
new file mode 100644
index 0000000..88970f7
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ