You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:20:22 UTC
[tika] 01/01: TIKA-4116 -- don't extract macros from directory nodes
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4116
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 40ace13e67e70428b21871d55633c317a32f3074
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:20:12 2023 -0400
TIKA-4116 -- don't extract macros from directory nodes
---
.../main/java/org/apache/tika/parser/microsoft/OfficeParser.java | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index c3c81e792..c787bf66d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -155,6 +155,7 @@ public class OfficeParser extends AbstractOfficeParser {
final DirectoryNode root;
TikaInputStream tstream = TikaInputStream.cast(stream);
POIFSFileSystem mustCloseFs = null;
+ boolean isDirectoryNode = false;
try {
if (tstream == null) {
mustCloseFs = new POIFSFileSystem(CloseShieldInputStream.wrap(stream));
@@ -165,6 +166,7 @@ public class OfficeParser extends AbstractOfficeParser {
root = ((POIFSFileSystem) container).getRoot();
} else if (container instanceof DirectoryNode) {
root = (DirectoryNode) container;
+ isDirectoryNode = true;
} else {
POIFSFileSystem fs = null;
if (tstream.hasFile()) {
@@ -187,8 +189,11 @@ public class OfficeParser extends AbstractOfficeParser {
//We might consider not bothering to check for macros in root,
//if we know we're processing ppt based on content-type identified in metadata
- extractMacros(root.getFileSystem(), xhtml,
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+ if (! isDirectoryNode) {
+ // if the "root" is a directory node, we assume that the macros have already
+ // been extracted from the parent's fileSystem -- TIKA-4116
+ extractMacros(root.getFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+ }
}
} finally {