You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/07 19:26:13 UTC
[tika] branch main updated: TIKA-3968 (#948)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 79d4ba86a TIKA-3968 (#948)
79d4ba86a is described below
commit 79d4ba86a5ad40873fe75f0f5281b7d68b77ad69
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Feb 7 14:26:00 2023 -0500
TIKA-3968 (#948)
* TIKA-3968 -- extract actual embedded file names from associated EMF files in docx
---
CHANGES.txt | 2 +
.../main/java/org/apache/tika/metadata/Office.java | 6 +
.../apache/tika/parser/microsoft/EMFParser.java | 113 +++++++++++----
.../tika/parser/microsoft/HSLFExtractor.java | 1 +
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 91 +++++++++---
.../microsoft/ooxml/EmbeddedPartMetadata.java | 69 +++++++++
.../ooxml/XWPFWordExtractorDecorator.java | 161 ++++++++++++++++-----
.../tika/parser/microsoft/EMFParserTest.java | 12 ++
.../ooxml/OOXMLContainerExtractionTest.java | 2 +-
.../testEMF_iconOnlyLongFilename.emf | Bin 0 -> 8300 bytes
.../test-documents/testWORD_EMFAndAttachments.docx | Bin 0 -> 61769 bytes
.../parser/microsoft/ooxml/OOXMLParserTest.java | 56 +++++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 100 +++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
14 files changed, 535 insertions(+), 80 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index fcb3355d8..af11a2430 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.7.1 - ???
+ * Improve extraction of embedded file names in .docx (TIKA-3968).
+
* Normalize author, title, subject and description to their Dublin Core
properties in the HTMLParser (TIKA-3963).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 8c9243f94..aff57f701 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -168,4 +168,10 @@ public interface Office {
Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate(
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"mapi-msg-client-submit-time");
+
+ /**
+ * Embedded files may have a "progID" associated with them, such as
+ * Word.Document.12 or AcroExch.Document.DC
+ */
+ Property PROG_ID = Property.internalText("msoffice:progID");
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index f69975685..c82cc3e8a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -28,6 +28,7 @@ import org.apache.poi.hemf.record.emf.HemfRecordType;
import org.apache.poi.hemf.record.emf.HemfText;
import org.apache.poi.hemf.usermodel.HemfPicture;
import org.apache.poi.util.RecordFormatException;
+import org.apache.poi.util.StringUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -36,6 +37,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -54,6 +56,11 @@ import org.apache.tika.sax.XHTMLContentHandler;
*/
public class EMFParser extends AbstractParser {
+    /**
+     * Set to {@code true} on the metadata of the parsed EMF when a string was captured
+     * from the comment record that directly follows the first "IconOnly" comment record.
+     */
+    public static final Property EMF_ICON_ONLY = Property.internalBoolean("emf:iconOnly");
+
+    /**
+     * The string captured from the comment record that directly follows the first
+     * "IconOnly" comment record.
+     */
+    public static final Property EMF_ICON_STRING = Property.internalText("emf:iconString");
+
+    //text of the generic comment record that is matched to detect an icon-only EMF
+    private static final String ICON_ONLY = "IconOnly";
+
private static final MediaType MEDIA_TYPE = MediaType.image("emf");
private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf");
@@ -88,50 +95,48 @@ public class EMFParser extends AbstractParser {
xhtml.startDocument();
try {
HemfPicture ex = new HemfPicture(stream);
- double lastY = -1;
- double lastX = -1;
+ ParseState parseState = new ParseState();
long fudgeFactorX = 1000;//derive this from the font or frame/bounds information
StringBuilder buffer = new StringBuilder();
+ //iterate through the records. if you hit IconOnly in a comment
+ //and it is the first IconOnly, grab the string in the next comment record
+ //and that'll be the full name of the file.
for (HemfRecord record : ex) {
+ parseState.isIconOnly = false;
if (record.getEmfRecordType() == HemfRecordType.comment) {
- HemfComment.EmfCommentData commentData =
- ((HemfComment.EmfComment) record).getCommentData();
- if (commentData instanceof HemfComment.EmfCommentDataMultiformats) {
- if (embeddedDocumentExtractor == null) {
- embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
- }
- handleMultiFormats((HemfComment.EmfCommentDataMultiformats) commentData,
- xhtml, embeddedDocumentExtractor);
- } else if (commentData instanceof HemfComment.EmfCommentDataWMF) {
- if (embeddedDocumentExtractor == null) {
- embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
- }
- handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), xhtml,
- embeddedDocumentExtractor);
- }
+ handleCommentData(
+ ((HemfComment.EmfComment) record).getCommentData(), parseState, xhtml, context);
} else if (record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) {
-
HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record;
//change equality to delta diff;
- if (lastY > -1 && lastY != extTextOutW.getReference().getY()) {
+ if (parseState.lastY > -1 &&
+ parseState.lastY != extTextOutW.getReference().getY()) {
xhtml.startElement("p");
xhtml.characters(buffer.toString());
xhtml.endElement("p");
buffer.setLength(0);
- lastX = -1;
+ parseState.lastX = -1;
}
- if (lastX > -1 && extTextOutW.getReference().getX() - lastX > fudgeFactorX) {
+ if (parseState.lastX > -1 && extTextOutW.getReference().getX() -
+ parseState.lastX > fudgeFactorX) {
buffer.append(" ");
}
String txt = extTextOutW.getText();
buffer.append(txt);
- lastY = extTextOutW.getReference().getY();
- lastX = extTextOutW.getReference().getX();
+ parseState.lastY = extTextOutW.getReference().getY();
+ parseState.lastX = extTextOutW.getReference().getX();
+ }
+ if (parseState.isIconOnly) {
+ parseState.lastWasIconOnly = true;
+ } else {
+ parseState.lastWasIconOnly = false;
}
}
+ if (parseState.iconOnlyString != null) {
+ metadata.set(EMF_ICON_ONLY, true);
+ metadata.set(EMF_ICON_STRING, parseState.iconOnlyString);
+ }
if (buffer.length() > 0) {
xhtml.startElement("p");
xhtml.characters(buffer.toString());
@@ -146,6 +151,53 @@ public class EMFParser extends AbstractParser {
xhtml.endDocument();
}
+    /**
+     * Dispatches a single EMF comment record. Multiformat and WMF comments are handed to
+     * the embedded document extractor (looked up on first use and cached in the parse
+     * state). Generic comments are decoded as text and used to track the "IconOnly"
+     * marker and the string in the comment that follows it.
+     *
+     * @param commentData the data of the comment record
+     * @param parseState  mutable state shared across the records of one parse
+     * @param xhtml       handler that receives embedded content
+     * @param context     parse context used to look up the embedded document extractor
+     */
+    private void handleCommentData(
+            HemfComment.EmfCommentData commentData, ParseState parseState,
+            XHTMLContentHandler xhtml, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        if (commentData instanceof HemfComment.EmfCommentDataMultiformats) {
+            if (parseState.extractor == null) {
+                parseState.extractor =
+                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+            }
+            HemfComment.EmfCommentDataMultiformats multiformats =
+                    (HemfComment.EmfCommentDataMultiformats) commentData;
+            handleMultiFormats(multiformats, xhtml, parseState.extractor);
+            return;
+        }
+
+        if (commentData instanceof HemfComment.EmfCommentDataWMF) {
+            if (parseState.extractor == null) {
+                parseState.extractor =
+                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+            }
+            byte[] wmfBytes = ((HemfComment.EmfCommentDataWMF) commentData).getWMFData();
+            handleWMF(wmfBytes, xhtml, parseState.extractor);
+            return;
+        }
+
+        if (!(commentData instanceof HemfComment.EmfCommentDataGeneric)) {
+            return;
+        }
+
+        byte[] privateData = ((HemfComment.EmfCommentDataGeneric) commentData).getPrivateData();
+        String text = tryToReadAsString(privateData);
+        boolean firstIconOnly = ICON_ONLY.equals(text) && !parseState.hitIconOnly;
+        if (firstIconOnly) {
+            //remember that the marker was seen so that later markers are ignored
+            parseState.hitIconOnly = true;
+            parseState.isIconOnly = true;
+        } else if (parseState.lastWasIconOnly && parseState.iconOnlyString == null) {
+            //the record right after the marker carries the string we want to keep
+            parseState.iconOnlyString = text;
+        }
+    }
+
+    /**
+     * Best-effort decoding of a comment record's private data as a null terminated
+     * little-endian unicode string.
+     *
+     * @param bytes raw private data; may be {@code null}
+     * @return the decoded string, or {@code null} if there is no data, there are fewer
+     *         than two bytes, or decoding failed
+     */
+    private String tryToReadAsString(byte[] bytes) {
+        //guard against missing data so that a null array cannot abort the whole parse
+        if (bytes == null || bytes.length < 2) {
+            return null;
+        }
+        //act like this is a null terminated unicode le:
+        //the last two bytes are treated as the terminator and are not decoded
+        int stringLen = (bytes.length - 2) / 2;
+        try {
+            return StringUtil.getFromUnicodeLE0Terminated(bytes, 0, stringLen);
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //didn't work out...oh, well
+            return null;
+        }
+    }
+
private void handleWMF(byte[] bytes, ContentHandler contentHandler,
EmbeddedDocumentExtractor embeddedDocumentExtractor)
throws IOException, SAXException, TikaException {
@@ -173,4 +225,15 @@ public class EMFParser extends AbstractParser {
handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler);
}
}
+
+    /**
+     * Mutable state that is carried across the records of a single EMF parse.
+     */
+    private static class ParseState {
+        //embedded document extractor; looked up on first use
+        EmbeddedDocumentExtractor extractor;
+
+        //reference point of the most recent extTextOutW record; -1 means "none yet"
+        double lastX = -1;
+        double lastY = -1;
+
+        //true once the first "IconOnly" comment has been seen
+        boolean hitIconOnly;
+        //true if the record currently being handled is that first "IconOnly" comment
+        boolean isIconOnly;
+        //true if the previously handled record was that first "IconOnly" comment
+        boolean lastWasIconOnly;
+        //string read from the comment record that directly followed "IconOnly"
+        String iconOnlyString;
+    }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index ec64874bc..8a442383b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -573,6 +573,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
private void handleDataStream(InputStream dataStream, String objID, String progId,
XHTMLContentHandler xhtml) {
+ //TODO -- inject progId into the metadata of the embedded file
try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
String mediaType = null;
if ("Excel.Chart.8".equals(progId)) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 5a85d02cb..0493a2bd8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -23,6 +23,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
+import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -56,6 +57,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
@@ -64,6 +66,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
/**
@@ -135,7 +138,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
buildXHTML(xhtml);
// Now do any embedded parts
- handleEmbeddedParts(xhtml, metadata);
+ handleEmbeddedParts(xhtml, metadata, getEmbeddedPartMetadataMap());
// thumbnail
handleThumbnail(xhtml, metadata);
@@ -143,6 +146,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
xhtml.endDocument();
}
+    /**
+     * Metadata gathered for embedded parts, keyed by relationship id. This default
+     * implementation has nothing to offer and returns an empty map.
+     *
+     * @return map of relationship id to metadata about the embedded part; never null
+     */
+    protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() {
+        return Collections.<String, EmbeddedPartMetadata>emptyMap();
+    }
+
protected String getJustFileName(String desc) {
int idx = desc.lastIndexOf('/');
if (idx != -1) {
@@ -199,7 +206,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
}
}
- private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata metadata)
+ private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata metadata,
+ Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap)
throws TikaException, IOException, SAXException {
//keep track of media items that have been handled
//there can be multiple relationships pointing to the
@@ -214,7 +222,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
}
for (PackageRelationship rel : source.getRelationships()) {
try {
- handleEmbeddedPart(source, rel, xhtml, metadata, handledTarget);
+ handleEmbeddedPart(source, rel, xhtml, metadata,
+ embeddedPartMetadataMap, handledTarget);
} catch (SAXException | SecurityException e) {
throw e;
} catch (Exception e) {
@@ -229,6 +238,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private void handleEmbeddedPart(PackagePart source, PackageRelationship rel,
XHTMLContentHandler xhtml, Metadata parentMetadata,
+ Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap,
Set<String> handledTarget)
throws IOException, SAXException, TikaException, InvalidFormatException {
URI targetURI = rel.getTargetURI();
@@ -260,19 +270,28 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
} catch (IllegalArgumentException ex) {
return;
}
-
+ EmbeddedPartMetadata embeddedPartMetadata = embeddedPartMetadataMap.get(rel.getId());
String type = rel.getRelationshipType();
if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) &&
TYPE_OLE_OBJECT.equals(target.getContentType())) {
- handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata);
+ handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata,
+ embeddedPartMetadata);
+ if (targetURI != null) {
+ handledTarget.add(targetURI.toString());
+ }
+ } else if (PackageRelationshipTypes.IMAGE_PART.equals(type)) {
+ handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
+ embeddedPartMetadata, TikaCoreProperties.EmbeddedResourceType.INLINE);
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
} else if (RELATION_MEDIA.equals(type) || RELATION_VIDEO.equals(type) ||
- RELATION_AUDIO.equals(type) || PackageRelationshipTypes.IMAGE_PART.equals(type) ||
+ RELATION_AUDIO.equals(type) ||
POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) ||
POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
- handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
+ handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
+ embeddedPartMetadata,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT);
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
@@ -289,7 +308,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
* Handles an embedded OLE object in the document
*/
private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler xhtml, String rel,
- Metadata parentMetadata) throws IOException, SAXException {
+ Metadata parentMetadata,
+ EmbeddedPartMetadata embeddedPartMetadata) throws IOException,
+ SAXException {
// A POIFSFileSystem needs to be at least 3 blocks big to be valid
if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
// Too small, skip
@@ -308,6 +329,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
TikaInputStream stream = null;
try {
Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
DirectoryNode root = fs.getRoot();
@@ -315,10 +338,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
String packageEntryName = getPackageEntryName(root);
if (packageEntryName != null) {
- // TIKA-704: OLE 2.0 embedded non-Office document?
- //TODO: figure out if the equivalent of OLE 1.0's
- //getCommand() and getFileName() exist for OLE 2.0 to populate
- //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ //OLE 2.0
+ updateMetadata(metadata, embeddedPartMetadata);
stream = TikaInputStream.get(fs.createDocumentInputStream(packageEntryName));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
embeddedExtractor
@@ -348,7 +369,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
true);
}
} else {
- handleEmbeddedFile(part, xhtml, rel);
+ handleEmbeddedFile(part, xhtml, rel, embeddedPartMetadata,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT);
}
} catch (FileNotFoundException e) {
// There was no CONTENTS entry, so skip this part
@@ -366,6 +388,16 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
}
}
+    /**
+     * Copies the progId and the full name of an embedded part, when available, into the
+     * metadata of the embedded document.
+     *
+     * @param metadata             metadata of the embedded document to update
+     * @param embeddedPartMetadata information gathered for the part; may be {@code null},
+     *                             in which case nothing is changed
+     */
+    private void updateMetadata(Metadata metadata, EmbeddedPartMetadata embeddedPartMetadata) {
+        if (embeddedPartMetadata == null) {
+            return;
+        }
+        String progId = embeddedPartMetadata.getProgId();
+        if (!StringUtils.isBlank(progId)) {
+            metadata.set(Office.PROG_ID, progId);
+        }
+        //only record a name that actually carries information, as updateResourceName does
+        String fullName = embeddedPartMetadata.getFullName();
+        if (!StringUtils.isBlank(fullName)) {
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullName);
+        }
+    }
+
private String getPackageEntryName(DirectoryNode root) {
if (root.hasEntry("\u0001Ole")) {
//we used to require this too: root.hasEntry("\u0001CompObj") before TIKA-3526
@@ -386,15 +418,18 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
/**
* Handles an embedded file in the document
*/
- protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler xhtml, String rel)
+ protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler xhtml,
+ String rel,
+ EmbeddedPartMetadata embeddedPartMetadata,
+ TikaCoreProperties.EmbeddedResourceType embeddedResourceType)
throws SAXException, IOException {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ embeddedResourceType.name());
// Get the name
- String name = part.getPartName().getName();
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- name.substring(name.lastIndexOf('/') + 1));
+ updateResourceName(part, embeddedPartMetadata, metadata);
// Get the content type
metadata.set(Metadata.CONTENT_TYPE, part.getContentType());
@@ -408,6 +443,28 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
}
}
+    /**
+     * Sets the resource name (and the progId, if one is known) on the metadata of an
+     * embedded file. A non-blank full name from the embedded part metadata wins;
+     * otherwise the last segment of the part name is used.
+     *
+     * @param part                 the package part that holds the embedded file
+     * @param embeddedPartMetadata information gathered for the part; may be {@code null}
+     * @param metadata             metadata of the embedded file to update
+     */
+    private void updateResourceName(PackagePart part, EmbeddedPartMetadata embeddedPartMetadata,
+                                    Metadata metadata) {
+        String resourceName = null;
+        if (embeddedPartMetadata != null) {
+            String progId = embeddedPartMetadata.getProgId();
+            if (!StringUtils.isBlank(progId)) {
+                metadata.set(Office.PROG_ID, progId);
+            }
+            String candidate = embeddedPartMetadata.getFullName();
+            if (!StringUtils.isBlank(candidate)) {
+                resourceName = candidate;
+            }
+        }
+        if (resourceName == null) {
+            //TODO -- should we record the literal name of the embedded file?
+            String partName = part.getPartName().getName();
+            //lastIndexOf returns -1 when there is no slash, which keeps the whole name
+            resourceName = partName.substring(partName.lastIndexOf('/') + 1);
+        }
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName);
+    }
+
/**
* Populates the {@link XHTMLContentHandler} object received as parameter.
*/
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java
new file mode 100644
index 000000000..1e26aa1f9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/EmbeddedPartMetadata.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Holder for metadata about an embedded part that is found in the xml
+ * of the main document rather than in the embedded part itself.
+ */
+public class EmbeddedPartMetadata {
+
+    //This is the rId of the EMF file that is associated with
+    //the embedded object
+    private final String emfRelationshipId;
+
+    private String renderedName;
+    private String fullName;
+    private String progId;
+
+    /**
+     * @param emfRelationshipId relationship id of the EMF file
+     */
+    public EmbeddedPartMetadata(String emfRelationshipId) {
+        this.emfRelationshipId = emfRelationshipId;
+    }
+
+    /**
+     * @return relationship id of the EMF file, as passed to the constructor
+     */
+    public String getEmfRelationshipId() {
+        return this.emfRelationshipId;
+    }
+
+    /**
+     * @return the rendered name, or {@code null} if none has been set
+     */
+    public String getRenderedName() {
+        return this.renderedName;
+    }
+
+    public void setRenderedName(String renderedName) {
+        this.renderedName = renderedName;
+    }
+
+    /**
+     * @return the full name, or {@code null} if none has been set
+     */
+    public String getFullName() {
+        return this.fullName;
+    }
+
+    public void setFullName(String fullName) {
+        this.fullName = fullName;
+    }
+
+    /**
+     * @return the progId, or {@code null} if none has been set
+     */
+    public String getProgId() {
+        return this.progId;
+    }
+
+    public void setProgId(String progId) {
+        this.progId = progId;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index e5aacee15..efe092730 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -17,13 +17,17 @@
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import javax.xml.namespace.QName;
+import com.microsoft.schemas.vml.impl.CTShapeImpl;
+import org.apache.poi.ooxml.POIXMLDocumentPart;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
@@ -65,10 +69,13 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.EMFParser;
import org.apache.tika.parser.microsoft.FormattingUtils;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+import org.apache.tika.sax.ToTextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
@@ -90,6 +97,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private XWPFStyles styles;
private Metadata metadata;
+ //This is a map of the rIds of embedded files and the EmbeddedPartMetadata
+ //that may be associated with the embedded file. The EmbeddedPartMetadata
+ //is populated during the parse of the main document.
+ private Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap = new HashMap<>();
+
public XWPFWordExtractorDecorator(Metadata metadata, ParseContext context,
XWPFWordExtractor extractor) {
super(context, extractor);
@@ -144,6 +156,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
+    /**
+     * @return the map of embedded file rIds to the metadata that was collected for them
+     *         while the main document was parsed
+     */
+    @Override
+    protected Map<String, EmbeddedPartMetadata> getEmbeddedPartMetadataMap() {
+        return this.embeddedPartMetadataMap;
+    }
+
private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
@@ -209,43 +226,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
writeParagraphNumber(paragraph, listManager, xhtml);
- // Output placeholder for any embedded docs:
-
- // TODO: replace w/ XPath/XQuery:
- for (XWPFRun run : paragraph.getRuns()) {
- XmlCursor c = run.getCTR().newCursor();
- c.selectPath("./*");
- while (c.toNextSelection()) {
- XmlObject o = c.getObject();
- if (o instanceof CTObject) {
- XmlCursor c2 = o.newCursor();
- c2.selectPath("./*");
- while (c2.toNextSelection()) {
- XmlObject o2 = c2.getObject();
-
- XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
- if (embedAtt != null &&
- embedAtt.getDomNode().getNodeValue().equals("Embed")) {
- // Type is "Embed"
- XmlObject relIDAtt = o2.selectAttribute(new QName(
- "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
- "id"));
- if (relIDAtt != null) {
- String relID = relIDAtt.getDomNode().getNodeValue();
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", relID);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- }
- }
- }
- c2.dispose();
- }
- }
- c.dispose();
- }
+ // Output placeholder for any embedded docs:
+ processEmbeddedObjects(paragraph.getRuns(), xhtml);
// Attach bookmarks for the paragraph
// (In future, we might put them in the right place, for now
@@ -339,6 +322,112 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
+    /**
+     * Looks at the direct children of every run and hands each child that is a
+     * {@code CTObject} to {@link #processObject(XmlCursor, XHTMLContentHandler)}.
+     *
+     * @param runs  the runs to inspect
+     * @param xhtml handler that receives the placeholders for embedded objects
+     */
+    private void processEmbeddedObjects(List<XWPFRun> runs, XHTMLContentHandler xhtml)
+            throws SAXException {
+        // TODO: replace w/ XPath/XQuery:
+        for (XWPFRun run : runs) {
+            try (XmlCursor runCursor = run.getCTR().newCursor()) {
+                runCursor.selectPath("./*");
+                while (runCursor.toNextSelection()) {
+                    XmlObject child = runCursor.getObject();
+                    if (!(child instanceof CTObject)) {
+                        continue;
+                    }
+                    try (XmlCursor objectCursor = child.newCursor()) {
+                        processObject(objectCursor, xhtml);
+                    }
+                }
+            }
+        }
+    }
+
+ private void processObject(XmlCursor cursor, XHTMLContentHandler xhtml) throws SAXException {
+
+ cursor.selectPath("./*");
+ String objectRelId = null;
+ String progId = null;
+ EmbeddedPartMetadata embeddedPartMetadata = null;
+ while (cursor.toNextSelection()) {
+ XmlObject o2 = cursor.getObject();
+ XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
+ if (embedAtt != null &&
+ embedAtt.getDomNode().getNodeValue().equals("Embed")) {
+ //TODO: get ProgID, while we're here?
+ // Type is "Embed"
+ XmlObject relIDAtt = o2.selectAttribute(new QName(
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+ "id"));
+ if (relIDAtt != null) {
+ objectRelId = relIDAtt.getDomNode().getNodeValue();
+ }
+
+ XmlObject progIDAtt = o2.selectAttribute(new QName("ProgID"));
+ if (progIDAtt != null) {
+ progId = progIDAtt.getDomNode().getNodeValue();
+ }
+ } else if (o2 instanceof CTShapeImpl) {
+ XmlObject[] imagedata = o2.selectChildren(
+ new QName("urn:schemas" +
+ "-microsoft-com:vml","imagedata"));
+ if (imagedata.length > 0) {
+ XmlObject relIDAtt = imagedata[0].selectAttribute(new QName(
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+ "id"));
+ if (relIDAtt != null) {
+ String rid = relIDAtt.getDomNode().getNodeValue();
+ embeddedPartMetadata = new EmbeddedPartMetadata(rid);
+ tryToParseEmbeddedName(rid, embeddedPartMetadata);
+ }
+ }
+ }
+ }
+ if (objectRelId == null) {
+ return;
+ }
+ if (! StringUtils.isBlank(progId)) {
+ embeddedPartMetadata.setProgId(progId);
+ }
+
+ if (embeddedPartMetadata != null) {
+ embeddedPartMetadataMap.put(objectRelId, embeddedPartMetadata);
+ }
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", objectRelId);
+ if (!StringUtils.isBlank(embeddedPartMetadata.getFullName())) {
+ attributes.addAttribute("", "name", "name", "CDATA",
+ embeddedPartMetadata.getFullName());
+ }
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ }
+
+    /**
+     * Best-effort attempt to read the name of an embedded file from the EMF part that
+     * is referenced by the given relationship id. If the part has the content type
+     * {@code image/x-emf}, it is parsed with the {@link EMFParser}; the trimmed text
+     * becomes the rendered name and the {@link EMFParser#EMF_ICON_STRING} value becomes
+     * the full name. Failures other than a {@link SecurityException} are ignored.
+     *
+     * @param rid                  relationship id of the candidate EMF part
+     * @param embeddedPartMetadata receives the rendered name and the full name
+     * @return always {@code null}; the results are delivered via embeddedPartMetadata
+     */
+    private String tryToParseEmbeddedName(String rid, EmbeddedPartMetadata embeddedPartMetadata) {
+        //This tries to parse the embedded name out of a comment
+        //field in an emf
+        POIXMLDocumentPart part = document.getRelationById(rid);
+        if (part == null) {
+            return null;
+        }
+        PackagePart packagePart = part.getPackagePart();
+        if (packagePart == null || packagePart.getContentType() == null) {
+            return null;
+        }
+        if (!"image/x-emf".equals(packagePart.getContentType())) {
+            return null;
+        }
+        try (InputStream emfStream = packagePart.getInputStream()) {
+            EMFParser emfParser = new EMFParser();
+            Metadata emfMetadata = new Metadata();
+            ParseContext emfContext = new ParseContext();
+            ToTextContentHandler textHandler = new ToTextContentHandler();
+            emfParser.parse(emfStream, textHandler, emfMetadata, emfContext);
+            embeddedPartMetadata.setRenderedName(textHandler.toString().trim());
+            embeddedPartMetadata.setFullName(emfMetadata.get(EMFParser.EMF_ICON_STRING));
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //we tried
+        }
+        return null;
+    }
+
private void writeParagraphNumber(XWPFParagraph paragraph, XWPFListManager listManager,
XHTMLContentHandler xhtml) throws SAXException {
if (paragraph.getNumIlvl() == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
index 318976e80..ec4297aba 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
@@ -41,6 +41,18 @@ public class EMFParserTest extends TikaTest {
emfMetadata.get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testIconOnly() throws Exception {
+     //test file contributed by Ross Spencer on TIKA-3968
+     Metadata emf = getRecursiveMetadata("testEMF_iconOnlyLongFilename.emf").get(0);
+     assertEquals("true", emf.get(EMFParser.EMF_ICON_ONLY));
+     assertEquals("some word doc with a very long name that should be wrapped.docx",
+             emf.get(EMFParser.EMF_ICON_STRING));
+     String extractedText = emf.get(TikaCoreProperties.TIKA_CONTENT);
+     assertContains("some word doc", extractedText);
+     assertContains("a very long name that should be wrapped.docx", extractedText);
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 21e87b3bd..dfe86f204 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -300,7 +300,7 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
assertEquals("image1.emf", handler.filenames.get(0));
assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
- assertNull(handler.filenames.get(1));
+ assertEquals("Acrobat Document", handler.filenames.get(1));
assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMF_iconOnlyLongFilename.emf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMF_iconOnlyLongFilename.emf
new file mode 100644
index 000000000..96ea7879b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMF_iconOnlyLongFilename.emf differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx
new file mode 100644
index 000000000..86e4b4541
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_EMFAndAttachments.docx differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b1c63cd72..65f14f169 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,6 +16,9 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
import java.util.List;
import org.junit.jupiter.api.Disabled;
@@ -23,8 +26,10 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.EMFParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
public class OOXMLParserTest extends TikaTest {
@@ -60,6 +65,57 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Hello World", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testEMFAssociatedWithAttachments() throws Exception {
+ //TIKA-3968
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_EMFAndAttachments.docx");
+
+ assertEquals("true", metadataList.get(1).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("true", metadataList.get(3).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("true", metadataList.get(5).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("TestText.txt", metadataList.get(1).get(EMFParser.EMF_ICON_STRING));
+ assertEquals("TestPdf.pdf", metadataList.get(3).get(EMFParser.EMF_ICON_STRING));
+ assertEquals("testWORD123.docx", metadataList.get(5).get(EMFParser.EMF_ICON_STRING));
+
+ assertNull(metadataList.get(2).get(Office.PROG_ID));
+ assertEquals("AcroExch.Document.DC", metadataList.get(4).get(Office.PROG_ID));
+ assertEquals("Word.Document.12", metadataList.get(6).get(Office.PROG_ID));
+
+ assertEquals("TestText.txt", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("TestPdf.pdf", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("testWORD123.docx", metadataList.get(6).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+ assertEquals("/TestText.txt",
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/TestPdf.pdf",
+ metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/testWORD123.docx",
+ metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+ assertContains("This is Text File",
+ metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertContains("This is test PDF document for parser.",
+ metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertContains("This is test word document for parser.",
+ metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(3).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(5).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
@Disabled("TODO figure out why this doesn't work")
@Test//(expected = org.apache.tika.exception.TikaException.class)
public void testCorruptedZip() throws Exception {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
new file mode 100644
index 000000000..ef76cc641
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.util.List;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.EMFParser;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+
+public class SXWPFExtractorTest extends TikaTest {
+
+ private ParseContext parseContext;
+
+ @BeforeEach
+ public void setUp() {
+ parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setUseSAXDocxExtractor(true);
+ officeParserConfig.setUseSAXPptxExtractor(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+ }
+ @Test
+ @Disabled("TODO -- implement TIKA-3968 for SXWPFExtractor")
+ public void testEMFAssociatedWithAttachments() throws Exception {
+ //TIKA-3968
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_EMFAndAttachments.docx", parseContext);
+
+ assertEquals("true", metadataList.get(1).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("true", metadataList.get(3).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("true", metadataList.get(5).get(EMFParser.EMF_ICON_ONLY));
+ assertEquals("TestText.txt", metadataList.get(1).get(EMFParser.EMF_ICON_STRING));
+ assertEquals("TestPdf.pdf", metadataList.get(3).get(EMFParser.EMF_ICON_STRING));
+ assertEquals("testWORD123.docx", metadataList.get(5).get(EMFParser.EMF_ICON_STRING));
+
+ assertNull(metadataList.get(2).get(Office.PROG_ID));
+ assertEquals("AcroExch.Document.DC", metadataList.get(4).get(Office.PROG_ID));
+ assertEquals("Word.Document.12", metadataList.get(6).get(Office.PROG_ID));
+
+ assertEquals("TestText.txt", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("TestPdf.pdf", metadataList.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("testWORD123.docx", metadataList.get(6).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+ assertEquals("/TestText.txt",
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/TestPdf.pdf",
+ metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/testWORD123.docx",
+ metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+ assertContains("This is Text File",
+ metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertContains("This is test PDF document for parser.",
+ metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertContains("This is test word document for parser.",
+ metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(4).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name(),
+ metadataList.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(3).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.name(),
+ metadataList.get(5).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index bac5fc6a9..4e034fd47 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -202,7 +202,7 @@ public class PDFParserTest extends TikaTest {
assertEquals(3, tracker.filenames.size());
assertEquals(3, tracker.mediaTypes.size());
assertEquals("image1.emf", tracker.filenames.get(0));
- assertNull(tracker.filenames.get(1));
+ assertEquals("attached.pdf", tracker.filenames.get(1));
assertEquals("Test.docx", tracker.filenames.get(2));
assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));