You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/08/12 19:48:04 UTC
[tika] 03/03: TIKA-3161 -- extract macros from open document formats
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 611417f64e3d1848f0903951f624cc65a0aa5e27
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 12 15:47:41 2020 -0400
TIKA-3161 -- extract macros from open document formats
---
CHANGES.txt | 22 +++++++
...dler.java => FlatOpenDocumentMacroHandler.java} | 15 +++--
.../tika/parser/odf/FlatOpenDocumentParser.java | 4 +-
.../tika/parser/odf/OpenDocumentMacroHandler.java | 62 ++----------------
.../apache/tika/parser/odf/OpenDocumentParser.java | 49 ++++++++++++++-
.../org/apache/tika/parser/odf/ODFParserTest.java | 69 +++++++++++++++++++++
.../test/resources/test-documents/testODPMacro.odp | Bin 0 -> 14505 bytes
.../test/resources/test-documents/testODSMacro.ods | Bin 0 -> 30726 bytes
.../test/resources/test-documents/testODTMacro.odt | Bin 0 -> 29912 bytes
9 files changed, 154 insertions(+), 67 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 66122b3..1f584d9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,28 @@ Release 2.0.0 - ???
Release 1.25 - ???
+ * Add detection and a parser for flat ODF files (TIKA-3159).
+
+ * Add extraction of macros from ODF files (TIKA-3161).
+
+ * Add mime detection for hprof and hprof text files (TIKA-3144).
+
+ * Add TextSignature and TextProfileSignature to tika-eval (TIKA-3145 and TIKA-3146)
+
+ * Create a metadata filter to trigger tika-eval stats post parsing (TIKA-3140)
+
+ * Add a configurable metadata-filter for the RecursiveParserWrapper (TIKA-3137)
+
+ * Add status endpoint to tika-server (TIKA-3129).
+
+ * Remove whitelist/blacklist terminology (TIKA-3120)
+
+ * Add detection for parquet files (TIKA-3115).
+
+ * Add detection and parsing for bplist (TIKA-3104).
+
+ * Enable metadata value filtering
+
* Add a basic parser for plist files based on com.googlecode.plist:dd-plist (TIKA-3104).
Release 1.24.1 - 4/17/2020
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
similarity index 90%
copy from tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
copy to tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
index 14e4d8d..fefc824 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
@@ -34,20 +34,20 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-class OpenDocumentMacroHandler extends ContentHandlerDecorator {
+class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator {
- private static String MODULE = "module";
+ static String MODULE = "module";
private static String SOURCE_CODE = "source-code";
- private static String NAME = "name";
+ static String NAME = "name";
private final ContentHandler contentHandler;
private final ParseContext parseContext;
private EmbeddedDocumentExtractor embeddedDocumentExtractor;
private final StringBuilder macroBuffer = new StringBuilder();
- private String macroName = null;
- private boolean inMacro = false;
+ String macroName = null;
+ boolean inMacro = false;
- OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
+ FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
super(contentHandler);
this.contentHandler = contentHandler;
this.parseContext = parseContext;
@@ -84,7 +84,7 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
}
}
- private void handleMacro() throws IOException, SAXException {
+ protected void handleMacro() throws IOException, SAXException {
byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8);
@@ -110,6 +110,5 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
);
}
}
-
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
index 442840c..04c7cd5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentParser.java
@@ -41,6 +41,8 @@ import java.util.Set;
public class FlatOpenDocumentParser extends AbstractParser {
+ private static final long serialVersionUID = -8739250869531737584L;
+
static final MediaType FLAT_OD = MediaType.application("vnd.oasis.opendocument.tika.flat.document");
static final MediaType FLAT_ODT = MediaType.application("vnd.oasis.opendocument.flat.text");
@@ -115,7 +117,7 @@ public class FlatOpenDocumentParser extends AbstractParser {
this.parseContext = parseContext;
this.bodyHandler = new OpenDocumentBodyHandler(new NSNormalizerContentHandler(baseHandler), parseContext);
this.metadataHandler = OpenDocumentMetaParser.getContentHandler(metadata, parseContext);
- this.macroHandler = new OpenDocumentMacroHandler(baseHandler, parseContext);
+ this.macroHandler = new FlatOpenDocumentMacroHandler(baseHandler, parseContext);
}
MediaType getDetectedType() {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
index 14e4d8d..79c11ab 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
@@ -34,48 +34,25 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-class OpenDocumentMacroHandler extends ContentHandlerDecorator {
-
- private static String MODULE = "module";
- private static String SOURCE_CODE = "source-code";
- private static String NAME = "name";
-
- private final ContentHandler contentHandler;
- private final ParseContext parseContext;
- private EmbeddedDocumentExtractor embeddedDocumentExtractor;
- private final StringBuilder macroBuffer = new StringBuilder();
- private String macroName = null;
- private boolean inMacro = false;
+class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler {
OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
- super(contentHandler);
- this.contentHandler = contentHandler;
- this.parseContext = parseContext;
+ super(contentHandler, parseContext);
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
- if (MODULE.equals(localName)) {
- macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
- } else if (SOURCE_CODE.equals(localName)) {
- inMacro = true;
- }
+ inMacro = true;
+ macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
}
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- if (inMacro) {
- macroBuffer.append(ch, start, length);
- }
- }
@Override
public void endElement(
String namespaceURI, String localName, String qName) throws SAXException {
- if (SOURCE_CODE.equals(localName)) {
+ if (MODULE.equals(localName)) {
try {
handleMacro();
} catch (IOException e) {
@@ -83,33 +60,4 @@ class OpenDocumentMacroHandler extends ContentHandlerDecorator {
}
}
}
-
- private void handleMacro() throws IOException, SAXException {
-
- byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8);
-
- if (embeddedDocumentExtractor == null) {
- embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
- }
- Metadata embeddedMetadata = new Metadata();
- if (! StringUtils.isBlank(macroName)) {
- embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName);
- }
- embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
-
- //reset state before parse
- macroBuffer.setLength(0);
- macroName = null;
- inMacro = false;
-
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- try (InputStream is = TikaInputStream.get(bytes)) {
- embeddedDocumentExtractor.parseEmbedded(
- is, contentHandler, embeddedMetadata, false
- );
- }
- }
-
- }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 86ac3cf..a750a9b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -30,6 +30,8 @@ import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.lang3.StringUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -42,7 +44,9 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -204,7 +208,7 @@ public class OpenDocumentParser extends AbstractParser {
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
+ ParseContext context, ContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
@@ -232,6 +236,10 @@ public class OpenDocumentParser extends AbstractParser {
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
+ if (ignoreScriptFile(embeddedName)) {
+ return;
+ }
+
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
@@ -244,12 +252,51 @@ public class OpenDocumentParser extends AbstractParser {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
}
+
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
+ } else if (embeddedName.contains("Basic/")) {
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ String name = getMacroName(embeddedName);
+ if (!StringUtils.isAllBlank(name)) {
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ }
+ handler = new OpenDocumentMacroHandler(handler, context);
+ XMLReaderUtils.parseSAX(
+ new CloseShieldInputStream(zip),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ handler)), context);
}
}
}
+
+ private String getMacroName(String embeddedName) { // Derives a macro name from a zip entry name: the text after the last '/', with one trailing ".xml" removed; returns null when no name can be derived.
+ // A null entry name yields a null macro name.
+ if (embeddedName == null) {
+ return null;
+ }
+ int lastSlash = embeddedName.lastIndexOf("/");
+ if (lastSlash > -1) {
+ return embeddedName.substring(lastSlash+1).replaceFirst("\\.xml$", ""); // may be an empty string when the entry name ends with '/'
+ }
+ return null; // an entry name without any '/' also yields null, not the name itself
+ }
+
+ private boolean ignoreScriptFile(String embeddedName) { // Returns true only when the entry name contains "Basic/" and also contains "script-lb.xml" or "script-lc.xml"; no null check, so a null name throws NullPointerException.
+ if (embeddedName.contains("Basic/")) { // NOTE(review): the call site visible in this patch sits inside the "Thumbnails/" / "Pictures/" branch, so this is true there only if the name also contains "Basic/" -- confirm that placement is intended
+ if (embeddedName.contains("script-lb.xml")) { // presumably a library index file rather than a macro module -- TODO confirm
+ return true;
+ } else if (embeddedName.contains("script-lc.xml")) { // presumably a library container index file -- TODO confirm
+ return true;
+ }
+ }
+ return false; // every other entry name is not ignored
+ }
+
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index d8ecedd..5f98b6c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -392,6 +392,75 @@ public class ODFParserTest extends TikaTest {
}
@Test
+ public void testMacroODT() throws Exception { // Checks the metadata list produced for testODTMacro.odt: container entry, one MACRO entry named "test", and a PNG entry.
+ List<Metadata> metadataList = getRecursiveMetadata("testODTMacro.odt");
+ assertEquals(4, metadataList.size()); // four entries expected; only indices 0-2 are inspected below
+ Metadata parent = metadataList.get(0);
+ // entry 0 is used as the container document: extracted content and content type
+ assertContains("<p>Hello dear user,</p>",
+ parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("application/vnd.oasis.opendocument.text",
+ parent.get(Metadata.CONTENT_TYPE));
+
+ //make sure metadata came through
+ assertEquals("LibreOffice/6.4.3.2$MacOSX_X86_64 LibreOffice_project/747b5d0ebf89f41c860ec2a39efd7cb15b54f2d8",
+ parent.get("generator"));
+ assertEquals(1, parent.getInt(PagedText.N_PAGES).intValue());
+ // entry 1 must be flagged as a MACRO, carry the expected source text, and be named "test"
+ Metadata macro = metadataList.get(1);
+ assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+ assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("test", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ // entry 2 must have content type image/png
+ Metadata image = metadataList.get(2);
+ assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testMacroODS() throws Exception { // Checks the metadata list produced for testODSMacro.ods: container entry, one MACRO entry named "test1", and a PNG entry.
+ List<Metadata> metadataList = getRecursiveMetadata("testODSMacro.ods");
+ assertEquals(4, metadataList.size()); // four entries expected; only indices 0-2 are inspected below
+ Metadata parent = metadataList.get(0);
+ // entry 0 is used as the container document: content contains a table row and the spreadsheet content type
+ assertContains("<tr>",
+ parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("application/vnd.oasis.opendocument.spreadsheet",
+ parent.get(Metadata.CONTENT_TYPE));
+ // entry 1 must be flagged as a MACRO, carry the expected source text, and be named "test1"
+ Metadata macro = metadataList.get(1);
+ assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+ assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("test1", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ // entry 2 must have content type image/png
+ Metadata image = metadataList.get(2);
+ assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testMacroODP() throws Exception { // Checks the metadata list produced for testODPMacro.odp: container entry and one MACRO entry named "testmodule".
+ List<Metadata> metadataList = getRecursiveMetadata("testODPMacro.odp");
+ assertEquals(3, metadataList.size()); // three entries expected; only indices 0 and 1 are inspected below
+ Metadata parent = metadataList.get(0);
+ // entry 0 is used as the container document: content contains a paragraph tag and the presentation content type
+ assertContains("<p",
+ parent.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("application/vnd.oasis.opendocument.presentation",
+ parent.get(Metadata.CONTENT_TYPE));
+ //make sure metadata came through
+ assertEquals("LibreOffice/6.4.3.2$MacOSX_X86_64 LibreOffice_project/747b5d0ebf89f41c860ec2a39efd7cb15b54f2d8",
+ parent.get("generator"));
+
+ assertEquals("2", parent.get("editing-cycles"));
+ // entry 1 must be flagged as a MACRO, carry the expected source text, and be named "testmodule"
+ Metadata macro = metadataList.get(1);
+ assertEquals("MACRO", macro.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY));
+ assertContains("If WsGQFM Or 2 Then", macro.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertEquals("testmodule", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("testmodule", macro.get(TikaCoreProperties.RESOURCE_NAME_KEY)); // NOTE(review): exact repeat of the assertion on the previous line
+
+ }
+
+ @Test
public void testMacroFODT() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testODTMacro.fodt");
assertEquals(3, metadataList.size());
diff --git a/tika-parsers/src/test/resources/test-documents/testODPMacro.odp b/tika-parsers/src/test/resources/test-documents/testODPMacro.odp
new file mode 100644
index 0000000..35dee15
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODPMacro.odp differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODSMacro.ods b/tika-parsers/src/test/resources/test-documents/testODSMacro.ods
new file mode 100644
index 0000000..99a2bcf
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODSMacro.ods differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODTMacro.odt b/tika-parsers/src/test/resources/test-documents/testODTMacro.odt
new file mode 100644
index 0000000..6309e97
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTMacro.odt differ