You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/15 15:54:36 UTC

[tika] branch main updated: TIKA-4091 -- further refinements (#1286)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new c3a9dc4ea TIKA-4091 -- further refinements (#1286)
c3a9dc4ea is described below

commit c3a9dc4eae3e87b6ba77ab2c931d71f59c02e19f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Aug 15 11:54:31 2023 -0400

    TIKA-4091 -- further refinements (#1286)
    
    * TIKA-4091 -- small fixes to surprises from the initial commits for TIKA-4091
---
 .../main/java/org/apache/tika/metadata/Office.java |  2 +
 .../detect/microsoft/POIFSContainerDetector.java   |  6 ++
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 78 ++++++++++++++++++----
 3 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index aff57f701..815f060c1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -174,4 +174,6 @@ public interface Office {
      * Word.Document.12 or AcroExch.Document.DC
      */
     Property PROG_ID = Property.internalText("msoffice:progID");
+
+    Property OCX_NAME = Property.internalText("msoffice:ocxName");
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 23648c884..b67638f8d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -91,6 +91,8 @@ public class POIFSContainerDetector implements Detector {
      */
     public static final MediaType MS_EQUATION = application("vnd.ms-equation");
 
+    public static final String OCX_NAME = "\u0003OCXNAME";
+
     /**
      * Microsoft Excel
      */
@@ -343,6 +345,10 @@ public class POIFSContainerDetector implements Detector {
             return WPS;
         } else if (ucNames.contains(EQUATION_NATIVE)) {
             return MS_EQUATION;
+        } else if (ucNames.contains(OCX_NAME)) {
+            //active x control should be parsed as OLE, not COMP_OBJ -- TIKA-4091
+            //TODO -- create a mime for active x
+            return OLE;
         } else if (ucNames.contains(CONTENTS) && ucNames.contains(OBJ_INFO)) {
             return COMP_OBJ;
         } else if (ucNames.contains(CONTENTS) && ucNames.contains(COMP_OBJ_STRING)) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index fa464b902..df4668eb8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.hpsf.ClassID;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
@@ -27,6 +28,8 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.StringUtil;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaConfig;
@@ -36,6 +39,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
@@ -48,6 +52,8 @@ import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.StringUtils;
 
 abstract class AbstractPOIFSExtractor {
+
+    private static final String OCX_NAME = "\u0003OCXNAME";
     protected final Metadata parentMetadata;//metadata of the parent/container document
     protected final OfficeParserConfig officeParserConfig;
     protected final ParseContext context;
@@ -199,6 +205,7 @@ abstract class AbstractPOIFSExtractor {
         }
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         String rName = (resourceName == null) ? dir.getName() : resourceName;
+        extractOCXName(dir, metadata);
         if (type == POIFSDocumentType.OLE10_NATIVE) {
             handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
         } else if (type == POIFSDocumentType.COMP_OBJ) {
@@ -217,6 +224,35 @@ abstract class AbstractPOIFSExtractor {
         }
     }
 
+    private void extractOCXName(DirectoryEntry dir, Metadata metadata) {
+        if (! dir.hasEntry(OCX_NAME)) {
+            return;
+        }
+        try {
+            Entry e = dir.getEntry(OCX_NAME);
+            if (!e.isDocumentEntry()) {
+                return;
+            }
+            UnsynchronizedByteArrayOutputStream bos = new UnsynchronizedByteArrayOutputStream();
+            try (DocumentInputStream dis = new DocumentInputStream((DocumentEntry) e)) {
+                IOUtils.copy(dis, bos);
+            }
+            byte[] bytes = bos.toByteArray();
+            int charCount = (bytes.length - 4);
+            if (charCount < 0) {
+                return;
+            }
+            if (charCount % 2 != 0) {
+                return;
+            }
+            charCount /= 2;
+            String ocxName = StringUtil.getFromUnicodeLE0Terminated(bytes, 0, charCount);
+            metadata.set(Office.OCX_NAME, ocxName);
+        } catch (IOException e) {
+            //log this?
+        }
+    }
+
     private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
                                Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)
             throws IOException, SAXException {
@@ -224,21 +260,19 @@ abstract class AbstractPOIFSExtractor {
         //getCommand() and getFileName() exist for OLE 2.0 to populate
         //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
 
+        String contentsEntryName = getContentsEntryName(dir);
+        if (contentsEntryName == null) {
+            //log or record exception?
+            return;
+        }
         // Grab the contents and process
         DocumentEntry contentsEntry;
-        /*if (dir.hasEntry("CorelDRAW")) {
-            contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
-         */
-        //TODO: modify getEntry to case insensitive when available in POI
+
         try {
-            contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
-        } catch (FileNotFoundException fnfe1) {
-            try {
-                contentsEntry = (DocumentEntry) dir.getEntry("Contents");
-            } catch (FileNotFoundException fnfe2) {
-                EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
-                return;
-            }
+            contentsEntry = (DocumentEntry) dir.getEntry(contentsEntryName);
+        } catch (FileNotFoundException fnfe) {
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata);
+            return;
         }
 
         int length = contentsEntry.getSize();
@@ -272,6 +306,26 @@ abstract class AbstractPOIFSExtractor {
         }
     }
 
+    private String getContentsEntryName(DirectoryEntry dir) {
+        /*
+        if (dir.hasEntry("CorelDRAW")) {
+            contentsEntry = (DocumentEntry) dir.getEntry("CorelDRAW");}
+         */
+        //TODO: modify getEntry to case insensitive when available in POI
+        if (dir.hasEntry("CONTENTS")) {
+            return "CONTENTS";
+        } else if (dir.hasEntry("Contents")) {
+            return "Contents";
+        } else {
+            for (String n : dir.getEntryNames()) {
+                if ("contents".equalsIgnoreCase(n)) {
+                    return n;
+                }
+            }
+        }
+        return null;
+    }
+
 
     private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
                                  Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml)