You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/08/12 19:48:04 UTC

[tika] 03/03: TIKA-3161 -- extract macros from open document formats

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 611417f64e3d1848f0903951f624cc65a0aa5e27
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 12 15:47:41 2020 -0400

    TIKA-3161 -- extract macros from open document formats
---
 CHANGES.txt                                        |  22 +++++++
 ...dler.java => FlatOpenDocumentMacroHandler.java} |  15 +++--
 .../tika/parser/odf/FlatOpenDocumentParser.java    |   4 +-
 .../tika/parser/odf/OpenDocumentMacroHandler.java  |  62 ++----------------
 .../apache/tika/parser/odf/OpenDocumentParser.java |  49 ++++++++++++++-
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  69 +++++++++++++++++++++
 .../test/resources/test-documents/testODPMacro.odp | Bin 0 -> 14505 bytes
 .../test/resources/test-documents/testODSMacro.ods | Bin 0 -> 30726 bytes
 .../test/resources/test-documents/testODTMacro.odt | Bin 0 -> 29912 bytes
 9 files changed, 154 insertions(+), 67 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 66122b3..1f584d9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,28 @@ Release 2.0.0 - ???
 
 Release 1.25 - ???
 
+   * Add detection and a parser for flat ODF files (TIKA-3159).
+
+   * Add extraction of macros from ODF files (TIKA-3161).
+
+   * Add mime detection for hprof and hprof text files (TIKA-3144).
+
+   * Add TextSignature and TextProfileSignature to tika-eval (TIKA-3145 and TIKA-3146)
+
+   * Create a metadata filter to trigger tika-eval stats post parsing (TIKA-3140)
+
+   * Add a configurable metadata-filter for the RecursiveParserWrapper (TIKA-3137)
+
+   * Add status endpoint to tika-server (TIKA-3129).
+
+   * Remove whitelist/blacklist terminology (TIKA-3120)
+
+   * Add detection for parquet files (TIKA-3115).
+
+   * Add detection and parsing for bplist (TIKA-3104).
+
+   * Enable metadata value filtering
+
    * Add a basic parser for plist files based on com.googlecode.plist:dd-plist (TIKA-3104).
 
 Release 1.24.1 - 4/17/2020
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
similarity index 90%
copy from tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
copy to tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
index 14e4d8d..fefc824 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
@@ -34,20 +34,20 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 
 
-class OpenDocumentMacroHandler extends ContentHandlerDecorator {
+class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator {
 
-    private static String MODULE = "module";
+    static String MODULE = "module";
     private static String SOURCE_CODE = "source-code";
-    private static String NAME = "name";
+    static String NAME = "name";
 
     private final ContentHandler contentHandler;
     private final ParseContext parseContext;
     private EmbeddedDocumentExtractor embeddedDocumentExtractor;
     private final StringBuilder macroBuffer = new StringBuilder();
-    private String macroName = null;
-    private boolean inMacro = false;
+    String macroName = null;
+    boolean inMacro = false;
 
-    OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
+    FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
         super(contentHandler);
         this.contentHandler = contentHandler;
         this.parseContext = parseContext;
@@ -84,7 +84,7 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
         }
     }
 
-    private void handleMacro() throws IOException, SAXException {
+    protected void handleMacro() throws IOException, SAXException {
 
         byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8);
 
@@ -110,6 +110,5 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
                 );
             }
         }
-
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
index 442840c..04c7cd5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
@@ -41,6 +41,8 @@ import java.util.Set;
 
 public class FlatOpenDocumentParser extends AbstractParser {
 
+    private static final long serialVersionUID = -8739250869531737584L;
+
     static final MediaType FLAT_OD = MediaType.application("vnd.oasis.opendocument.tika.flat.document");
 
     static final MediaType FLAT_ODT = MediaType.application("vnd.oasis.opendocument.flat.text");
@@ -115,7 +117,7 @@ public class FlatOpenDocumentParser extends AbstractParser {
             this.parseContext = parseContext;
             this.bodyHandler = new OpenDocumentBodyHandler(new NSNormalizerContentHandler(baseHandler), parseContext);
             this.metadataHandler = OpenDocumentMetaParser.getContentHandler(metadata, parseContext);
-            this.macroHandler = new OpenDocumentMacroHandler(baseHandler, parseContext);
+            this.macroHandler = new FlatOpenDocumentMacroHandler(baseHandler, parseContext);
         }
 
         MediaType getDetectedType() {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
index 14e4d8d..79c11ab 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
@@ -34,48 +34,25 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 
 
-class OpenDocumentMacroHandler extends ContentHandlerDecorator {
-
-    private static String MODULE = "module";
-    private static String SOURCE_CODE = "source-code";
-    private static String NAME = "name";
-
-    private final ContentHandler contentHandler;
-    private final ParseContext parseContext;
-    private EmbeddedDocumentExtractor embeddedDocumentExtractor;
-    private final StringBuilder macroBuffer = new StringBuilder();
-    private String macroName = null;
-    private boolean inMacro = false;
+class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler {
 
     OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
-        super(contentHandler);
-        this.contentHandler = contentHandler;
-        this.parseContext = parseContext;
+        super(contentHandler, parseContext);
     }
 
     @Override
     public void startElement(
             String namespaceURI, String localName, String qName,
             Attributes attrs) throws SAXException {
-        if (MODULE.equals(localName)) {
-            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
-        } else if (SOURCE_CODE.equals(localName)) {
-            inMacro = true;
-        }
+        inMacro = true;
+        macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
     }
 
-    @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        if (inMacro) {
-            macroBuffer.append(ch, start, length);
-        }
-    }
 
     @Override
     public void endElement(
             String namespaceURI, String localName, String qName) throws SAXException {
-        if (SOURCE_CODE.equals(localName)) {
+        if (MODULE.equals(localName)) {
             try {
                 handleMacro();
             } catch (IOException e) {
@@ -83,33 +60,4 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
             }
         }
     }
-
-    private void handleMacro() throws IOException, SAXException {
-
-        byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8);
-
-        if (embeddedDocumentExtractor == null) {
-            embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
-        }
-        Metadata embeddedMetadata = new Metadata();
-        if (! StringUtils.isBlank(macroName)) {
-            embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName);
-        }
-        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
-
-        //reset state before parse
-        macroBuffer.setLength(0);
-        macroName = null;
-        inMacro = false;
-
-        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-            try (InputStream is = TikaInputStream.get(bytes)) {
-                embeddedDocumentExtractor.parseEmbedded(
-                        is, contentHandler, embeddedMetadata, false
-                );
-            }
-        }
-
-    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 86ac3cf..a750a9b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -30,6 +30,8 @@ import java.util.zip.ZipFile;
 import java.util.zip.ZipInputStream;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -42,7 +44,9 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -204,7 +208,7 @@ public class OpenDocumentParser extends AbstractParser {
         }
     }
     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
-                                ParseContext context, EndDocumentShieldingContentHandler handler)
+                                ParseContext context, ContentHandler handler)
             throws IOException, SAXException, TikaException {
         if (entry == null) return;
 
@@ -232,6 +236,10 @@ public class OpenDocumentParser extends AbstractParser {
             //scrape everything under Thumbnails/ and Pictures/
             if (embeddedName.contains("Thumbnails/") ||
                     embeddedName.contains("Pictures/")) {
+                if (ignoreScriptFile(embeddedName)) {
+                    return;
+                }
+
                 EmbeddedDocumentExtractor embeddedDocumentExtractor =
                         EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                 Metadata embeddedMetadata = new Metadata();
@@ -244,12 +252,51 @@ public class OpenDocumentParser extends AbstractParser {
                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                             TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
                 }
+
                 if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
                     embeddedDocumentExtractor.parseEmbedded(zip,
                             new EmbeddedContentHandler(handler), embeddedMetadata, false);
                 }
+            } else if (embeddedName.contains("Basic/")) {
+                Metadata embeddedMetadata = new Metadata();
+                embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+                String name = getMacroName(embeddedName);
+                if (!StringUtils.isAllBlank(name)) {
+                    embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+                }
+                handler = new OpenDocumentMacroHandler(handler, context);
+                XMLReaderUtils.parseSAX(
+                        new CloseShieldInputStream(zip),
+                        new OfflineContentHandler(new EmbeddedContentHandler(
+                                handler)), context);
             }
 
         }
     }
+
+    private String getMacroName(String embeddedName) {
+
+        if (embeddedName == null) {
+            return null;
+        }
+        int lastSlash = embeddedName.lastIndexOf("/");
+        if (lastSlash > -1) {
+            return embeddedName.substring(lastSlash+1).replaceFirst("\\.xml$", "");
+        }
+        return null;
+    }
+
+    private boolean ignoreScriptFile(String embeddedName) {
+        if (embeddedName.contains("Basic/")) {
+            if (embeddedName.contains("script-lb.xml")) {
+                return true;
+            } else if (embeddedName.contains("script-lc.xml")) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index d8ecedd..5f98b6c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -392,6 +392,75 @@ public class ODFParserTest extends TikaTest {
     }
 
     @Test
+    public void testMacroODT() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testODTMacro.odt");
+        assertEquals(4, metadataList.size());
+        Metadata parent = metadataList.get(0);
+
+        assertContains("<p>Hello dear user,</p>",
+                parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("application/vnd.oasis.opendocument.text",
+                parent.get(Metadata.CONTENT_TYPE));
+
+        //make sure metadata came through
+        assertEquals("LibreOffice/6.4.3.2$MacOSX_X86_64 LibreOffice_project/747b5d0ebf89f41c860ec2a39efd7cb15b54f2d8",
+                parent.get("generator"));
+        assertEquals(1, parent.getInt(PagedText.N_PAGES).intValue());
+
+        Metadata macro = metadataList.get(1);
+        assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+        assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("test", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+        Metadata image = metadataList.get(2);
+        assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testMacroODS() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testODSMacro.ods");
+        assertEquals(4, metadataList.size());
+        Metadata parent = metadataList.get(0);
+
+        assertContains("<tr>",
+                parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("application/vnd.oasis.opendocument.spreadsheet",
+                parent.get(Metadata.CONTENT_TYPE));
+
+        Metadata macro = metadataList.get(1);
+        assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+        assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("test1", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+        Metadata image = metadataList.get(2);
+        assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testMacroODP() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testODPMacro.odp");
+        assertEquals(3, metadataList.size());
+        Metadata parent = metadataList.get(0);
+
+        assertContains("<p",
+                parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("application/vnd.oasis.opendocument.presentation",
+                parent.get(Metadata.CONTENT_TYPE));
+        //make sure metadata came through
+        assertEquals("LibreOffice/6.4.3.2$MacOSX_X86_64 LibreOffice_project/747b5d0ebf89f41c860ec2a39efd7cb15b54f2d8",
+                parent.get("generator"));
+
+        assertEquals("2", parent.get("editing-cycles"));
+
+        Metadata macro = metadataList.get(1);
+        assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+        assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("testmodule", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("testmodule", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+    }
+
+    @Test
     public void testMacroFODT() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testODTMacro.fodt");
         assertEquals(3, metadataList.size());
diff --git a/tika-parsers/src/test/resources/test-documents/testODPMacro.odp b/tika-parsers/src/test/resources/test-documents/testODPMacro.odp
new file mode 100644
index 0000000..35dee15
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODPMacro.odp differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODSMacro.ods b/tika-parsers/src/test/resources/test-documents/testODSMacro.ods
new file mode 100644
index 0000000..99a2bcf
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODSMacro.ods differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODTMacro.odt b/tika-parsers/src/test/resources/test-documents/testODTMacro.odt
new file mode 100644
index 0000000..6309e97
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTMacro.odt differ