You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:18:50 UTC

[tika] branch TIKA-4091 updated: TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4091
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4091 by this push:
     new c7860a408 TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
c7860a408 is described below

commit c7860a408a3e35cb1a247122c35405c1ee28d1ba
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:18:33 2023 -0400

    TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
---
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 42 +++++++++++++++-------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 45beee741..df4668eb8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -260,21 +260,19 @@ abstract class AbstractPOIFSExtractor {
         //getCommand() and getFileName() exist for OLE 2.0 to populate
         //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
 
+        String contentsEntryName = getContentsEntryName(dir);
+        if (contentsEntryName == null) {
+            //log or record exception?
+            return;
+        }
         // Grab the contents and process
         DocumentEntry contentsEntry;
-        /*if (dir.hasEntry("CorelDRAW")) {
-            contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
-         */
-        //TODO: modify getEntry to case insensitive when available in POI
+
         try {
-            contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
-        } catch (FileNotFoundException fnfe1) {
-            try {
-                contentsEntry = (DocumentEntry) dir.getEntry("Contents");
-            } catch (FileNotFoundException fnfe2) {
-                EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
-                return;
-            }
+            contentsEntry = (DocumentEntry) dir.getEntry(contentsEntryName);
+        } catch (FileNotFoundException fnfe) {
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata);
+            return;
         }
 
         int length = contentsEntry.getSize();
@@ -308,6 +306,26 @@ abstract class AbstractPOIFSExtractor {
         }
     }
 
+    private String getContentsEntryName(DirectoryEntry dir) {
+        /*
+        if (dir.hasEntry("CorelDRAW")) {
+            contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
+         */
+        //TODO: modify getEntry to case insensitive when available in POI
+        if (dir.hasEntry("CONTENTS")) {
+            return "CONTENTS";
+        } else if (dir.hasEntry("Contents")) {
+            return "Contents";
+        } else {
+            for (String n : dir.getEntryNames()) {
+                if ("contents".equalsIgnoreCase(n)) {
+                    return n;
+                }
+            }
+        }
+        return null;
+    }
+
 
     private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
                                  Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)