You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/08 19:19:51 UTC
[tika] 02/02: TIKA-3711 -- allow configuration of EmbeddedDocumentExtractors via tika-config.xml -- review and correct places where outputHtml should be false.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6552b076f0b4987423710b72b8917150422ea112
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 8 15:19:35 2022 -0400
TIKA-3711 -- allow configuration of EmbeddedDocumentExtractors via tika-config.xml -- review and correct places where outputHtml should be false.
---
CHANGES.txt | 6 ++++
.../parser/microsoft/AbstractPOIFSExtractor.java | 33 ++++++++++++----------
.../tika/parser/microsoft/ExcelExtractor.java | 2 +-
.../tika/parser/microsoft/HSLFExtractor.java | 9 +++---
.../tika/parser/microsoft/JackcessExtractor.java | 2 +-
.../tika/parser/microsoft/OutlookExtractor.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 2 +-
.../microsoft/onenote/OneNoteTreeWalker.java | 6 ++--
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 2 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 8 +++---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 13 +++++----
.../tika/parser/pdf/ImageGraphicsEngine.java | 4 +--
.../tika/parser/microsoft/XML2003ParserTest.java | 4 +--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 -----------------
.../org/apache/tika/parser/pkg/ZipParserTest.java | 25 ++++++++++++++++
16 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 7472cbed1..21203d8e4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -18,6 +18,12 @@ Release 2.4.0 - ???
* Add a fetcher and emitter for Azure blob storage (TIKA-3707).
+ * Allow configuration of the EmbeddedDocumentExtractor used
+ by the AutoDetectParser via tika-config.xml (TIKA-3711).
+
+ * Fixed regression in 2.3.0 that led to more embedded filenames
+ than appropriate being written to the content (TIKA-3711).
+
* tika-server now clones forking process' environment variables
into forked process (TIKA-3715).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index f5fc6d8f4..7a14733e8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -143,16 +143,17 @@ abstract class AbstractPOIFSExtractor {
/**
* Handle an office document that's embedded at the POIFS level
*/
- protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
+ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml,
+ boolean outputHtml)
throws IOException, SAXException, TikaException {
- handleEmbeddedOfficeDoc(dir, null, xhtml);
+ handleEmbeddedOfficeDoc(dir, null, xhtml, outputHtml);
}
/**
* Handle an office document that's embedded at the POIFS level
*/
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
- XHTMLContentHandler xhtml)
+ XHTMLContentHandler xhtml, boolean outputHtml)
throws IOException, SAXException, TikaException {
@@ -181,7 +182,7 @@ abstract class AbstractPOIFSExtractor {
return;
}
handleEmbeddedResource(stream, metadata,null, dir.getName(), dir.getStorageClsid(),
- type.toString(), xhtml, true);
+ type.toString(), xhtml, outputHtml);
return;
}
}
@@ -198,19 +199,19 @@ abstract class AbstractPOIFSExtractor {
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
String rName = (resourceName == null) ? dir.getName() : resourceName;
if (type == POIFSDocumentType.OLE10_NATIVE) {
- handleOLENative(dir, type, rName, metadata, xhtml);
+ handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
} else if (type == POIFSDocumentType.COMP_OBJ) {
- handleCompObj(dir, type, rName, metadata, xhtml);
+ handleCompObj(dir, type, rName, metadata, xhtml, outputHtml);
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
rName + '.' + type.getExtension());
- parseEmbedded(dir, xhtml, metadata);
+ parseEmbedded(dir, xhtml, metadata, outputHtml);
}
}
private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
- Metadata metadata, XHTMLContentHandler xhtml)
+ Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
throws IOException, SAXException {
//TODO: figure out if the equivalent of OLE 1.0's
//getCommand() and getFileName() exist for OLE 2.0 to populate
@@ -253,7 +254,7 @@ abstract class AbstractPOIFSExtractor {
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
- parseEmbedded(dir, tis, xhtml, metadata);
+ parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
} finally {
inp.close();
}
@@ -261,7 +262,7 @@ abstract class AbstractPOIFSExtractor {
private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
- Metadata metadata, XHTMLContentHandler xhtml)
+ Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
throws IOException, SAXException {
byte[] data = null;
try {
@@ -289,12 +290,13 @@ abstract class AbstractPOIFSExtractor {
return;
}
try (TikaInputStream tis = TikaInputStream.get(data)) {
- parseEmbedded(dir, tis, xhtml, metadata);
+ parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
}
}
private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
- Metadata metadata) throws IOException, SAXException {
+ Metadata metadata, boolean outputHtml) throws IOException,
+ SAXException {
if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
return;
}
@@ -302,10 +304,11 @@ abstract class AbstractPOIFSExtractor {
metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
}
- embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
}
- private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata)
+ private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata,
+ boolean outputHtml)
throws IOException, SAXException {
if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
return;
@@ -316,7 +319,7 @@ abstract class AbstractPOIFSExtractor {
metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
dir.getStorageClsid().toString());
}
- embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index f9e0cca0f..5bbc5257d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -192,7 +192,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
for (Entry entry : root) {
if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
try {
- handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+ handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
} catch (TikaException e) {
// ignore parse errors from embedded documents
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index addc1825d..8c4a53059 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -155,7 +155,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
handleSlideEmbeddedPictures(ss, xhtml);
- handleShowEmbeddedResources(ss, xhtml);
+ handleShowEmbeddedResources(ss, xhtml, true);
if (officeParserConfig.isExtractMacros()) {
extractMacros(ss, xhtml);
@@ -172,7 +172,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
* them in the shapes in the slides, headers/footers, etc, try to
* extract them here.
**/
- private void handleShowEmbeddedResources(HSLFSlideShow ss, XHTMLContentHandler xhtml)
+ private void handleShowEmbeddedResources(HSLFSlideShow ss, XHTMLContentHandler xhtml,
+ boolean outputHtml)
throws SAXException {
HSLFObjectData[] objectData = ss.getEmbeddedObjects();
@@ -193,7 +194,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
if (pfs.getRoot().getEntryNames().size() < 1) {
return;
}
- handleEmbeddedOfficeDoc(pfs.getRoot(), filename, xhtml);
+ handleEmbeddedOfficeDoc(pfs.getRoot(), filename, xhtml, outputHtml);
}
} else {
boolean shouldProcess = false;
@@ -582,7 +583,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
throw new IOException(e);
}
try {
- handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
+ handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml, false);
} finally {
if (poifs != null) {
poifs.close();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index a6d54edb3..a224fbca3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -386,7 +386,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
return;
}
- handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
+ handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml, true);
} finally {
if (fileSystem != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index ef7f16f23..a7983d7b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -286,7 +286,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
}
if (attachment.getAttachmentDirectory() != null) {
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
- xhtml);
+ xhtml, true);
}
xhtml.endElement("div");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 781607359..95bb4a20e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -221,7 +221,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
for (Entry entry : op) {
if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
- handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+ handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index 3dbe8e780..f5738bb19 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -345,13 +345,13 @@ class OneNoteTreeWalker {
}
Metadata embeddedMetadata = new Metadata();
try {
- stream = TikaInputStream.get(buf.array());
- embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
- embeddedMetadata, true);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
xhtml.startElement("div", attributes);
xhtml.endElement("div");
+ stream = TikaInputStream.get(buf.array());
+ embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
} finally {
IOUtils.closeQuietly(stream);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b922128ba..5a85d02cb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -184,7 +184,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream),
- new EmbeddedContentHandler(handler), thumbnailMetadata, true);
+ new EmbeddedContentHandler(handler), thumbnailMetadata, false);
}
tStream.close();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 9beffc246..8379d671a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -273,7 +273,7 @@ public class OutlookPSTParser extends AbstractParser {
}
try {
- embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+ embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, false);
} finally {
tis.close();
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 6b4851b0a..0590568d2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -255,7 +255,7 @@ public class WordMLParser extends AbstractXML2003Parser {
}
handler.startElement(XHTMLContentHandler.XHTML, IMG, IMG, attrs);
handler.endElement(XHTMLContentHandler.XHTML, IMG, IMG);
- handleEmbedded();
+ handleEmbedded(false);
} else if (BIN_DATA.equals(localName)) {
inBin = false;
boolean success = false;
@@ -269,12 +269,12 @@ public class WordMLParser extends AbstractXML2003Parser {
buffer.setLength(0);
}
if (success && !inPict) {
- handleEmbedded();
+ handleEmbedded(true);
}
}
}
- private void handleEmbedded() throws SAXException {
+ private void handleEmbedded(boolean outputHtml) throws SAXException {
if (rawBytes != null) {
try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
Metadata metadata = new Metadata();
@@ -285,7 +285,7 @@ public class WordMLParser extends AbstractXML2003Parser {
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
}
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
- embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, true);
+ embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, outputHtml);
}
} catch (IOException e) {
//log
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index a35cb4893..93dfbd119 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -427,15 +427,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
return;
}
+
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", fileName);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
try {
embeddedDocumentExtractor
.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
- true);
-
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", fileName);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
+ false);
} finally {
IOUtils.closeQuietly(stream);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 8515fb3c1..63c382558 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -408,7 +408,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
embeddedDocumentExtractor
.parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
- true);
+ false);
}
}
@@ -429,7 +429,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]),
- new EmbeddedContentHandler(xhtml), metadata, true);
+ new EmbeddedContentHandler(xhtml), metadata, false);
} finally {
//replace whatever was there before
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
index ec5d23015..986c08ce8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
@@ -42,9 +42,7 @@ public class XML2003ParserTest extends TikaTest {
assertContains("<meta name=\"meta:character-count-with-spaces\" content=\"256\"", xml);
//do not allow nested <p> elements
assertContains(
- "<img href=\"02000003.jpg\" /><div class=\"package-entry\"><h1>02000003.jpg</h1> " +
- "</div> <p /> <p><img href=\"02000004.jpg\" />",
- xml);
+ "<img href=\"02000003.jpg\" />", xml);
assertContains("<table><tbody>", xml);
assertContains("</tbody></table>", xml);
assertContains("<td><p>R1 c1</p> </td>", xml);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index fa7dc4cd6..b1c63cd72 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,19 +16,15 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.io.InputStream;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
public class OOXMLParserTest extends TikaTest {
@@ -70,26 +66,4 @@ public class OOXMLParserTest extends TikaTest {
//TIKA_2446
getRecursiveMetadata("testZIP_corrupted_oom.zip");
}
-
- @Test
- public void testConfiguringEmbeddedDocExtractor() throws Exception {
-
- TikaConfig tikaConfig = null;
- try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
- "/configs/tika-config-no-names.xml")) {
- tikaConfig = new TikaConfig(is);
- }
- Parser p = new AutoDetectParser(tikaConfig);
- String xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
- assertNotContained("<h1>/docProps/thumbnail.jpeg</h1>", xml);
-
- try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
- "/configs/tika-config-with-names.xml")) {
- tikaConfig = new TikaConfig(is);
- }
- p = new AutoDetectParser(tikaConfig);
- xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
- assertContains("<h1>/docProps/thumbnail.jpeg</h1>", xml);
- System.out.println(xml);
- }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 9f9f71357..8ed307487 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -28,10 +28,14 @@ import java.util.Set;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
import org.apache.tika.sax.BodyContentHandler;
/**
@@ -90,6 +94,27 @@ public class ZipParserTest extends AbstractPkgTest {
assertTrue(relIDs.allRelIDs.contains("test2.txt"));
}
+ @Test
+ public void testConfiguringEmbeddedDocExtractor() throws Exception {
+
+ TikaConfig tikaConfig = null;
+ try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ "/configs/tika-config-no-names.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ String xml = getXML("testEmbedded.zip", p).xml;
+ assertNotContained("<h1>image3.jpg</h1>", xml);
+
+ try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ "/configs/tika-config-with-names.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ p = new AutoDetectParser(tikaConfig);
+ xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+ assertContains("<h1>image3.jpg</h1>", xml);
+ }
+
@Test
public void testZipEncrypted() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");