You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:20:21 UTC

[tika] branch TIKA-4116 created (now 40ace13e6)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4116
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 40ace13e6 TIKA-4116 -- don't extract macros from directory nodes

This branch includes the following new commits:

     new 40ace13e6 TIKA-4116 -- don't extract macros from directory nodes

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.

[tika] 01/01: TIKA-4116 -- don't extract macros from directory nodes

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4116
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 40ace13e67e70428b21871d55633c317a32f3074
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:20:12 2023 -0400

    TIKA-4116 -- don't extract macros from directory nodes
---
 .../main/java/org/apache/tika/parser/microsoft/OfficeParser.java | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index c3c81e792..c787bf66d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -155,6 +155,7 @@ public class OfficeParser extends AbstractOfficeParser {
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
         POIFSFileSystem mustCloseFs = null;
+        boolean isDirectoryNode = false;
         try {
             if (tstream == null) {
                 mustCloseFs = new POIFSFileSystem(CloseShieldInputStream.wrap(stream));
@@ -165,6 +166,7 @@ public class OfficeParser extends AbstractOfficeParser {
                     root = ((POIFSFileSystem) container).getRoot();
                 } else if (container instanceof DirectoryNode) {
                     root = (DirectoryNode) container;
+                    isDirectoryNode = true;
                 } else {
                     POIFSFileSystem fs = null;
                     if (tstream.hasFile()) {
@@ -187,8 +189,11 @@ public class OfficeParser extends AbstractOfficeParser {
 
                 //We might consider not bothering to check for macros in root,
                 //if we know we're processing ppt based on content-type identified in metadata
-                extractMacros(root.getFileSystem(), xhtml,
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                if (! isDirectoryNode) {
+                    // if the "root" is a directory node, we assume that the macros have already
+                    // been extracted from the parent's fileSystem -- TIKA-4116
+                    extractMacros(root.getFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                }
 
             }
         } finally {