You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/08 23:58:11 UTC
[tika] branch main updated: TIKA-3156: Added ability to read hyperlinked images from ODT files
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2b45667 TIKA-3156: Added ability to read hyperlinked images from ODT files
2b45667 is described below
commit 2b456679200bd8b2e86864ae6db847923d2bc134
Author: David Meikle <dm...@apache.org>
AuthorDate: Sun Nov 8 23:23:06 2020 +0000
TIKA-3156: Added ability to read hyperlinked images from ODT files
---
CHANGES.txt | 2 ++
.../tika/parser/odf/OpenDocumentBodyHandler.java | 14 +++++++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 23 ++++++++++++++-------
.../org/apache/tika/parser/odf/ODFParserTest.java | 8 +++++++
.../test-documents/testODTEmbeddedImageLink.odt | Bin 0 -> 32873 bytes
5 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 4dca553..186f647 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -38,6 +38,8 @@ Release 1.25 - ???
* Add a basic parser for plist files based on com.googlecode.plist:dd-plist (TIKA-3104).
+ * Read hyperlinked images from ODT files (TIKA-3156).
+
Release 1.24.1 - 4/17/2020
* Allow gzip compression of input and output streams for tika-server (TIKA-3073).
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 9d18f64..0aba06e 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -159,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
MAPPINGS.put(
new QName(TEXT_NS, "a"),
new TargetElement(XHTML, "a", aAttsMapping));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "a"),
+ new TargetElement(XHTML, "a", aAttsMapping));
// create HTML tables from table:-tags
MAPPINGS.put(
@@ -431,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
+ if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+ String link = attrs.getValue(XLINK_NS, "href");
+ AttributesImpl attr = new AttributesImpl();
+ if (!StringUtils.isEmpty(link)) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+ }
+ handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+ handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+ }
+
if (BINARY_DATA.equals(localName)) {
inBinaryData = true;
return;
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 451adae..e7e93c5 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -30,9 +30,7 @@ import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import org.apache.tika.config.Field;
-import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOUtils;
@@ -100,6 +98,8 @@ public class OpenDocumentParser extends AbstractParser {
private static final String META_NAME = "meta.xml";
+ private EmbeddedDocumentUtil embeddedDocumentUtil;
+
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
@@ -131,6 +131,8 @@ public class OpenDocumentParser extends AbstractParser {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
@@ -244,21 +246,28 @@ public class OpenDocumentParser extends AbstractParser {
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
- EmbeddedDocumentExtractor embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
- embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+ TikaInputStream stream = TikaInputStream.get(zip);
+
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName());
if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
}
+
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+ MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+ if (embeddedMimeType != null) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+ }
+ stream.reset();
}
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- embeddedDocumentExtractor.parseEmbedded(zip,
+ if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+ embeddedDocumentUtil.parseEmbedded(stream,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
} else if (extractMacros && embeddedName.contains("Basic/")) {
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index f8777af..54eed2b 100644
--- a/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parser-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -374,6 +374,14 @@ public class ODFParserTest extends TikaTest {
assertEquals(3, metadataList.size());
}
+ @Test
+ public void testEmbeddedImageAndLink() throws Exception {
+ String xml = getXML("testODTEmbeddedImageLink.odt").xml;
+ assertContains("<a href=\"https://tika.apache.org/\">" +
+ "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
+ "<span>Visit Tika</span></a>", xml);
+ }
+
@Test(expected = IOException.class)
public void testInvalidFromStream() throws Exception {
try (InputStream is = this.getClass().getResource(
diff --git a/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt
new file mode 100644
index 0000000..88970f7
Binary files /dev/null and b/tika-parser-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ