You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 16:02:11 UTC
[tika] branch main updated: TIKA-4116 (#1285)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b968dbca8 TIKA-4116 (#1285)
b968dbca8 is described below

commit b968dbca8929d4f09113285bb5a1609cc5088eb4
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Aug 15 12:02:06 2023 -0400

    TIKA-4116 (#1285)
    
    * TIKA-4116 -- don't extract macros from directory nodes
---
 CHANGES.txt                                                    |  2 ++
 .../java/org/apache/tika/parser/microsoft/OfficeParser.java    | 10 ++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 85d9e0dad..55bd83671 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,8 @@ Release 2.8.1 - ???
      the PDFParser now throws an EncryptedDocumentException instead of an IOException
      if the security handler cannot be found (TIKA-4082).
 
+   * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116).
+
    * Changed default decompressConcatenated to true in CompressorParser.
      Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index c3c81e792..c082b30d0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -155,6 +155,7 @@ public class OfficeParser extends AbstractOfficeParser {
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
         POIFSFileSystem mustCloseFs = null;
+        boolean isDirectoryNode = false;
         try {
             if (tstream == null) {
                 mustCloseFs = new POIFSFileSystem(CloseShieldInputStream.wrap(stream));
@@ -165,6 +166,7 @@ public class OfficeParser extends AbstractOfficeParser {
                     root = ((POIFSFileSystem) container).getRoot();
                 } else if (container instanceof DirectoryNode) {
                     root = (DirectoryNode) container;
+                    isDirectoryNode = true;
                 } else {
                     POIFSFileSystem fs = null;
                     if (tstream.hasFile()) {
@@ -187,8 +189,12 @@ public class OfficeParser extends AbstractOfficeParser {
 
                 //We might consider not bothering to check for macros in root,
                 //if we know we're processing ppt based on content-type identified in metadata
-                extractMacros(root.getFileSystem(), xhtml,
-                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                if (! isDirectoryNode) {
+                    // if the "root" is a directory node, we assume that the macros have already
+                    // been extracted from the parent's fileSystem -- TIKA-4116
+                    extractMacros(root.getFileSystem(), xhtml,
+                            EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
+                }
 
             }
         } finally {