You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:20:22 UTC

[tika] 01/01: TIKA-4116 -- don't extract macros from directory nodes

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4116
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 40ace13e67e70428b21871d55633c317a32f3074
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:20:12 2023 -0400

    TIKA-4116 -- don't extract macros from directory nodes
---
 .../main/java/org/apache/tika/parser/microsoft/OfficeParser.java | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index c3c81e792..c787bf66d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -155,6 +155,7 @@ public class OfficeParser extends AbstractOfficeParser {
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
         POIFSFileSystem mustCloseFs = null;
+        boolean isDirectoryNode = false;
         try {
             if (tstream == null) {
                 mustCloseFs = new POIFSFileSystem(CloseShieldInputStream.wrap(stream));
@@ -165,6 +166,7 @@ public class OfficeParser extends AbstractOfficeParser {
                     root = ((POIFSFileSystem) container).getRoot();
                 } else if (container instanceof DirectoryNode) {
                     root = (DirectoryNode) container;
+                    isDirectoryNode = true;
                 } else {
                     POIFSFileSystem fs = null;
                     if (tstream.hasFile()) {
@@ -187,8 +189,11 @@ public class OfficeParser extends AbstractOfficeParser {
 
                 //We might consider not bothering to check for macros in root,
                 //if we know we're processing ppt based on content-type identified in metadata
-                extractMacros(root.getFileSystem(), xhtml,
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                if (! isDirectoryNode) {
+                    // If "root" is a DirectoryNode, we assume the macros have already been
+                    // extracted from the parent's file system, so skip them -- TIKA-4116
+                    extractMacros(root.getFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                }
 
             }
         } finally {