You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/09 09:36:05 UTC
[tika] 02/02: TIKA-3156: Added ability to read hyperlinked images from ODT files
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 38d226801725ce3742bbc29ca62400cee115927a
Author: David Meikle <dm...@apache.org>
AuthorDate: Sun Nov 8 23:23:06 2020 +0000
TIKA-3156: Added ability to read hyperlinked images from ODT files
---
.../tika/parser/odf/OpenDocumentBodyHandler.java | 13 ++++++++++++
.../apache/tika/parser/odf/OpenDocumentParser.java | 23 ++++++++++++++-------
.../org/apache/tika/parser/odf/ODFParserTest.java | 8 +++++++
.../test-documents/testODTEmbeddedImageLink.odt | Bin 0 -> 32873 bytes
4 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 0349c7d..104f510 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -160,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
MAPPINGS.put(
new QName(TEXT_NS, "a"),
new TargetElement(XHTML, "a", aAttsMapping));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "a"),
+ new TargetElement(XHTML, "a", aAttsMapping));
// create HTML tables from table:-tags
MAPPINGS.put(
@@ -432,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
+ if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+ String link = attrs.getValue(XLINK_NS, "href");
+ AttributesImpl attr = new AttributesImpl();
+ if (!StringUtils.isEmpty(link)) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+ }
+ handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+ handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+ }
+
if (BINARY_DATA.equals(localName)) {
inBinaryData = true;
return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index b408ccf..851d3b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -32,9 +32,7 @@ import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
-import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -101,6 +99,8 @@ public class OpenDocumentParser extends AbstractParser {
private static final String META_NAME = "meta.xml";
+ private EmbeddedDocumentUtil embeddedDocumentUtil;
+
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
@@ -132,6 +132,8 @@ public class OpenDocumentParser extends AbstractParser {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
@@ -245,21 +247,28 @@ public class OpenDocumentParser extends AbstractParser {
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
- EmbeddedDocumentExtractor embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
- embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+ TikaInputStream stream = TikaInputStream.get(zip);
+
+ embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
}
+
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+ MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+ if (embeddedMimeType != null) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+ }
+ stream.reset();
}
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- embeddedDocumentExtractor.parseEmbedded(zip,
+ if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+ embeddedDocumentUtil.parseEmbedded(stream,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
} else if (extractMacros && embeddedName.contains("Basic/")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 719aae5..0b0e2ad 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -421,6 +421,14 @@ public class ODFParserTest extends TikaTest {
assertEquals(3, metadataList.size());
}
+ @Test
+ public void testEmbeddedImageAndLink() throws Exception {
+ String xml = getXML("testODTEmbeddedImageLink.odt").xml;
+ assertContains("<a href=\"https://tika.apache.org/\">" +
+ "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
+ "<span>Visit Tika</span></a>", xml);
+ }
+
@Test(expected = IOException.class)
public void testInvalidFromStream() throws Exception {
try (InputStream is = this.getClass().getResource(
diff --git a/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt
new file mode 100644
index 0000000..88970f7
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ