You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/08 19:19:51 UTC

[tika] 02/02: TIKA-3711 -- allow configuration of EmbeddedDocumentExtractors via tika-config.xml -- review and correct places where outputHtml should be false.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6552b076f0b4987423710b72b8917150422ea112
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 8 15:19:35 2022 -0400

    TIKA-3711 -- allow configuration of EmbeddedDocumentExtractors via tika-config.xml -- review and correct places where outputHtml should be false.
---
 CHANGES.txt                                        |  6 ++++
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 33 ++++++++++++----------
 .../tika/parser/microsoft/ExcelExtractor.java      |  2 +-
 .../tika/parser/microsoft/HSLFExtractor.java       |  9 +++---
 .../tika/parser/microsoft/JackcessExtractor.java   |  2 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |  2 +-
 .../tika/parser/microsoft/WordExtractor.java       |  2 +-
 .../microsoft/onenote/OneNoteTreeWalker.java       |  6 ++--
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  2 +-
 .../parser/microsoft/pst/OutlookPSTParser.java     |  2 +-
 .../tika/parser/microsoft/xml/WordMLParser.java    |  8 +++---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 13 +++++----
 .../tika/parser/pdf/ImageGraphicsEngine.java       |  4 +--
 .../tika/parser/microsoft/XML2003ParserTest.java   |  4 +--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    | 26 -----------------
 .../org/apache/tika/parser/pkg/ZipParserTest.java  | 25 ++++++++++++++++
 16 files changed, 77 insertions(+), 69 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 7472cbed1..21203d8e4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -18,6 +18,12 @@ Release 2.4.0 - ???
 
    * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
 
+   * Allow configurability of the EmbeddedDocumentExtractor used
+     by the AutoDetectParser (TIKA-3711).
+
+   * Fixed regression in 2.3.0 that caused embedded file names to be
+     written to the content in places where they should not be (TIKA-3711).
+
    * tika-server now clones forking process' environment variables
      into forked process (TIKA-3715).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index f5fc6d8f4..7a14733e8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -143,16 +143,17 @@ abstract class AbstractPOIFSExtractor {
     /**
      * Handle an office document that's embedded at the POIFS level
      */
-    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
+    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml,
+                                           boolean outputHtml)
             throws IOException, SAXException, TikaException {
-        handleEmbeddedOfficeDoc(dir, null, xhtml);
+        handleEmbeddedOfficeDoc(dir, null, xhtml, outputHtml);
     }
 
     /**
      * Handle an office document that's embedded at the POIFS level
      */
     protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
-                                           XHTMLContentHandler xhtml)
+                                           XHTMLContentHandler xhtml, boolean outputHtml)
             throws IOException, SAXException, TikaException {
 
 
@@ -181,7 +182,7 @@ abstract class AbstractPOIFSExtractor {
                     return;
                 }
                 handleEmbeddedResource(stream, metadata,null, dir.getName(), dir.getStorageClsid(),
-                        type.toString(), xhtml, true);
+                        type.toString(), xhtml, outputHtml);
                 return;
             }
         }
@@ -198,19 +199,19 @@ abstract class AbstractPOIFSExtractor {
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         String rName = (resourceName == null) ? dir.getName() : resourceName;
         if (type == POIFSDocumentType.OLE10_NATIVE) {
-            handleOLENative(dir, type, rName, metadata, xhtml);
+            handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
         } else if (type == POIFSDocumentType.COMP_OBJ) {
-            handleCompObj(dir, type, rName, metadata, xhtml);
+            handleCompObj(dir, type, rName, metadata, xhtml, outputHtml);
         } else {
             metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                     rName + '.' + type.getExtension());
-            parseEmbedded(dir, xhtml, metadata);
+            parseEmbedded(dir, xhtml, metadata, outputHtml);
         }
     }
 
     private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
-                               Metadata metadata, XHTMLContentHandler xhtml)
+                               Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
             throws IOException, SAXException {
         //TODO: figure out if the equivalent of OLE 1.0's
         //getCommand() and getFileName() exist for OLE 2.0 to populate
@@ -253,7 +254,7 @@ abstract class AbstractPOIFSExtractor {
             metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
             metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
-            parseEmbedded(dir, tis, xhtml, metadata);
+            parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
         } finally {
             inp.close();
         }
@@ -261,7 +262,7 @@ abstract class AbstractPOIFSExtractor {
 
 
     private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
-                                 Metadata metadata, XHTMLContentHandler xhtml)
+                                 Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
             throws IOException, SAXException {
         byte[] data = null;
         try {
@@ -289,12 +290,13 @@ abstract class AbstractPOIFSExtractor {
             return;
         }
         try (TikaInputStream tis = TikaInputStream.get(data)) {
-            parseEmbedded(dir, tis, xhtml, metadata);
+            parseEmbedded(dir, tis, xhtml, metadata, outputHtml);
         }
     }
 
     private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
-                               Metadata metadata) throws IOException, SAXException {
+                               Metadata metadata, boolean outputHtml) throws IOException,
+            SAXException {
         if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
             return;
         }
@@ -302,10 +304,11 @@ abstract class AbstractPOIFSExtractor {
             metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                     dir.getStorageClsid().toString());
         }
-        embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+        embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
     }
 
-    private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata)
+    private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata,
+                               boolean outputHtml)
             throws IOException, SAXException {
         if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
             return;
@@ -316,7 +319,7 @@ abstract class AbstractPOIFSExtractor {
                 metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
                         dir.getStorageClsid().toString());
             }
-            embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+            embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml);
         }
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index f9e0cca0f..5bbc5257d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -192,7 +192,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
         for (Entry entry : root) {
             if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
                 try {
-                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
                 } catch (TikaException e) {
                     // ignore parse errors from embedded documents
                 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index addc1825d..8c4a53059 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -155,7 +155,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
             }
 
             handleSlideEmbeddedPictures(ss, xhtml);
-            handleShowEmbeddedResources(ss, xhtml);
+            handleShowEmbeddedResources(ss, xhtml, true);
 
             if (officeParserConfig.isExtractMacros()) {
                 extractMacros(ss, xhtml);
@@ -172,7 +172,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
      * them in the shapes in the slides, headers/footers, etc, try to
      * extract them here.
      **/
-    private void handleShowEmbeddedResources(HSLFSlideShow ss, XHTMLContentHandler xhtml)
+    private void handleShowEmbeddedResources(HSLFSlideShow ss, XHTMLContentHandler xhtml,
+                                             boolean outputHtml)
             throws SAXException {
         
         HSLFObjectData[] objectData = ss.getEmbeddedObjects();
@@ -193,7 +194,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                         if (pfs.getRoot().getEntryNames().size() < 1) {
                             return;
                         }
-                        handleEmbeddedOfficeDoc(pfs.getRoot(), filename, xhtml);
+                        handleEmbeddedOfficeDoc(pfs.getRoot(), filename, xhtml, outputHtml);
                     }
                 } else {
                     boolean shouldProcess = false;
@@ -582,7 +583,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                                 throw new IOException(e);
                             }
                             try {
-                                handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
+                                handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml, false);
                             } finally {
                                 if (poifs != null) {
                                     poifs.close();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index a6d54edb3..a224fbca3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -386,7 +386,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
                 return;
             }
 
-            handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
+            handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml, true);
 
         } finally {
             if (fileSystem != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index ef7f16f23..a7983d7b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -286,7 +286,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 }
                 if (attachment.getAttachmentDirectory() != null) {
                     handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
-                            xhtml);
+                            xhtml, true);
                 }
 
                 xhtml.endElement("div");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 781607359..95bb4a20e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -221,7 +221,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
             for (Entry entry : op) {
                 if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
-                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml, true);
                 }
             }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index 3dbe8e780..f5738bb19 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -345,13 +345,13 @@ class OneNoteTreeWalker {
         }
         Metadata embeddedMetadata = new Metadata();
         try {
-            stream = TikaInputStream.get(buf.array());
-            embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
-                    embeddedMetadata, true);
             AttributesImpl attributes = new AttributesImpl();
             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
             xhtml.startElement("div", attributes);
             xhtml.endElement("div");
+            stream = TikaInputStream.get(buf.array());
+            embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
         } finally {
             IOUtils.closeQuietly(stream);
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b922128ba..5a85d02cb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -184,7 +184,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                     embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream),
-                            new EmbeddedContentHandler(handler), thumbnailMetadata, true);
+                            new EmbeddedContentHandler(handler), thumbnailMetadata, false);
                 }
 
                 tStream.close();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 9beffc246..8379d671a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -273,7 +273,7 @@ public class OutlookPSTParser extends AbstractParser {
                     }
 
                     try {
-                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, false);
                     } finally {
                         tis.close();
                     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 6b4851b0a..0590568d2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -255,7 +255,7 @@ public class WordMLParser extends AbstractXML2003Parser {
                 }
                 handler.startElement(XHTMLContentHandler.XHTML, IMG, IMG, attrs);
                 handler.endElement(XHTMLContentHandler.XHTML, IMG, IMG);
-                handleEmbedded();
+                handleEmbedded(false);
             } else if (BIN_DATA.equals(localName)) {
                 inBin = false;
                 boolean success = false;
@@ -269,12 +269,12 @@ public class WordMLParser extends AbstractXML2003Parser {
                     buffer.setLength(0);
                 }
                 if (success && !inPict) {
-                    handleEmbedded();
+                    handleEmbedded(true);
                 }
             }
         }
 
-        private void handleEmbedded() throws SAXException {
+        private void handleEmbedded(boolean outputHtml) throws SAXException {
             if (rawBytes != null) {
                 try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
                     Metadata metadata = new Metadata();
@@ -285,7 +285,7 @@ public class WordMLParser extends AbstractXML2003Parser {
                         metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
                     }
                     if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
-                        embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, true);
+                        embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, outputHtml);
                     }
                 } catch (IOException e) {
                     //log
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index a35cb4893..93dfbd119 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -427,15 +427,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
             return;
         }
+
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", fileName);
+        xhtml.startElement("div", attributes);
+        xhtml.endElement("div");
+
         try {
             embeddedDocumentExtractor
                     .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
-                            true);
-
-            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-            attributes.addAttribute("", "id", "id", "CDATA", fileName);
-            xhtml.startElement("div", attributes);
-            xhtml.endElement("div");
+                            false);
         } finally {
             IOUtils.closeQuietly(stream);
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 8515fb3c1..63c382558 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -408,7 +408,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
             try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
                 embeddedDocumentExtractor
                         .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
-                                true);
+                                false);
             }
         }
 
@@ -429,7 +429,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
                     ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
             embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]),
-                    new EmbeddedContentHandler(xhtml), metadata, true);
+                    new EmbeddedContentHandler(xhtml), metadata, false);
         } finally {
             //replace whatever was there before
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
index ec5d23015..986c08ce8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
@@ -42,9 +42,7 @@ public class XML2003ParserTest extends TikaTest {
         assertContains("<meta name=\"meta:character-count-with-spaces\" content=\"256\"", xml);
         //do not allow nested <p> elements
         assertContains(
-                "<img href=\"02000003.jpg\" /><div class=\"package-entry\"><h1>02000003.jpg</h1> " +
-                        "</div> <p /> <p><img href=\"02000004.jpg\" />",
-                xml);
+                "<img href=\"02000003.jpg\" />", xml);
         assertContains("<table><tbody>", xml);
         assertContains("</tbody></table>", xml);
         assertContains("<td><p>R1 c1</p> </td>", xml);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index fa7dc4cd6..b1c63cd72 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,19 +16,15 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import java.io.InputStream;
 import java.util.List;
 
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 
 public class OOXMLParserTest extends TikaTest {
@@ -70,26 +66,4 @@ public class OOXMLParserTest extends TikaTest {
         //TIKA_2446
         getRecursiveMetadata("testZIP_corrupted_oom.zip");
     }
-
-    @Test
-    public void testConfiguringEmbeddedDocExtractor() throws Exception {
-
-        TikaConfig tikaConfig = null;
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
-                "/configs/tika-config-no-names.xml")) {
-            tikaConfig = new TikaConfig(is);
-        }
-        Parser p = new AutoDetectParser(tikaConfig);
-        String xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
-        assertNotContained("<h1>/docProps/thumbnail.jpeg</h1>", xml);
-
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
-                "/configs/tika-config-with-names.xml")) {
-            tikaConfig = new TikaConfig(is);
-        }
-        p = new AutoDetectParser(tikaConfig);
-        xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
-        assertContains("<h1>/docProps/thumbnail.jpeg</h1>", xml);
-        System.out.println(xml);
-    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 9f9f71357..8ed307487 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -28,10 +28,14 @@ import java.util.Set;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -90,6 +94,27 @@ public class ZipParserTest extends AbstractPkgTest {
         assertTrue(relIDs.allRelIDs.contains("test2.txt"));
     }
 
+    @Test
+    public void testConfiguringEmbeddedDocExtractor() throws Exception {
+
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-no-names.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEmbedded.zip", p).xml;
+        assertNotContained("<h1>image3.jpg</h1>", xml);
+
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-with-names.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        p = new AutoDetectParser(tikaConfig);
+        xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+        assertContains("<h1>image3.jpg</h1>", xml);
+    }
+
     @Test
     public void testZipEncrypted() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");