You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/22 01:03:32 UTC
tika git commit: TIKA-2069 -- extract macros from MSOffice docs
Repository: tika
Updated Branches:
refs/heads/master 415381212 -> 2ae7206d9
TIKA-2069 -- extract macros from MSOffice docs
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2ae7206d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2ae7206d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2ae7206d
Branch: refs/heads/master
Commit: 2ae7206d9c99fb553314cff21bb155d4e6f06d12
Parents: 4153812
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 21:03:20 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 21:03:20 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 +
.../tika/metadata/TikaCoreProperties.java | 14 ++++-
.../tika/parser/microsoft/OfficeParser.java | 50 +++++++++++++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 20 +++++-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 3 +
.../ooxml/XSSFExcelExtractorDecorator.java | 7 +++
.../tika/parser/microsoft/ExcelParserTest.java | 12 ++++
.../parser/microsoft/PowerPointParserTest.java | 13 ++++
.../tika/parser/microsoft/WordParserTest.java | 13 ++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 63 ++++++++++++++-----
.../test-documents/testEXCEL_macro.xls | Bin 0 -> 30720 bytes
.../test-documents/testEXCEL_macro.xlsm | Bin 0 -> 14561 bytes
.../resources/test-documents/testPPT_macros.ppt | Bin 0 -> 88064 bytes
.../test-documents/testPPT_macros.pptm | Bin 0 -> 32824 bytes
.../test-documents/testWORD_macros.doc | Bin 0 -> 38400 bytes
.../test-documents/testWORD_macros.docm | Bin 0 -> 17322 bytes
16 files changed, 178 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index fc94e70..9a03b01 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.14 - ???
+ * Extract macros from MSOffice files (TIKA-2069).
+
* Maintain passed-in mime in TXTParser (TIKA-2047).
* Upgrade to POI.3-15 (TIKA-2013).
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index f4b97dd..9245086 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -39,16 +39,24 @@ public interface TikaCoreProperties {
/**
* A file might contain different types of embedded documents.
- * The most common is the ATTACHEMENT.
+ * The most common is the ATTACHMENT.
+ * <p>
* An INLINE embedded resource should be used for embedded image
* files that are used to render the page image (as in PDXObjImages in PDF files).
* <p>
- * Not all parsers have yet implemented this.
+ * A MACRO is code that is embedded in the document and is intended
+ * to be executable within the application that opens the document. This
+ * includes traditional macros within Microsoft Office files and
+ * JavaScript within PDFActions. This would not include, e.g., an
+ * .exe file embedded in a .zip file.
+ * <p>
+ * Not all parsers have yet implemented this.
*
*/
public enum EmbeddedResourceType {
INLINE,
- ATTACHMENT
+ ATTACHMENT,
+ MACRO
};
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b6681aa..f7f1c4a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -16,13 +16,16 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
@@ -35,11 +38,15 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -117,9 +124,17 @@ public class OfficeParser extends AbstractParser {
//tstream will close the fs, no need to close this below
tstream.setOpenContainer(fs);
root = fs.getRoot();
+
}
}
parse(root, context, metadata, xhtml);
+
+ //now try to get macros
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+ if (ex == null) {
+ ex = new ParsingEmbeddedDocumentExtractor(context);
+ }
+ extractMacros(root.getNFileSystem(), xhtml, ex);
} finally {
IOUtils.closeQuietly(mustCloseFs);
}
@@ -279,4 +294,39 @@ public class OfficeParser extends AbstractParser {
}
}
+ /**
+ * Helper to extract macros from an NPOIFS/vbaProject.bin
+ *
+ * As of POI-3.15-final, there are still some bugs in VBAMacroReader.
+ * For now, we swallow any exception (NPEs included) thrown while reading the macros.
+ *
+ * @param fs NPOIFS to extract from
+ * @param xhtml SAX writer
+ * @param embeddedDocumentExtractor extractor for embedded documents
+ * @throws IOException if an I/O error occurs while extracting the embedded document
+ * @throws SAXException if an error occurs while writing to the xhtml handler
+ */
+ public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor
+ embeddedDocumentExtractor) throws IOException, SAXException {
+
+ VBAMacroReader reader = null;
+ Map<String, String> macros = null;
+ try {
+ reader = new VBAMacroReader(fs);
+ macros = reader.readMacros();
+ } catch (Exception e) {
+ //swallow
+ return;
+ }
+ for (Map.Entry<String, String> e : macros.entrySet()) {
+ Metadata m = new Metadata();
+ m.set(Metadata.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ m.set(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+ embeddedDocumentExtractor.parseEmbedded(
+ new ByteArrayInputStream(e.getValue().getBytes(StandardCharsets.UTF_8)), xhtml, m, true);
+ }
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 67468b0..1f16a3c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -33,6 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -43,6 +44,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,7 +66,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
-
+ static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
+ static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
private final EmbeddedDocumentExtractor embeddedExtractor;
@@ -197,6 +200,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
|| RELATION_PACKAGE.equals(type)
|| RELATION_OLE_OBJECT.equals(type)) {
handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+ } else if (RELATION_MACRO.equals(type)) {
+ handleMacros(target, handler);
}
}
}
@@ -325,4 +330,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
*/
protected abstract List<PackagePart> getMainDocumentParts()
throws TikaException;
+
+
+ void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException {
+
+ try (InputStream is = macroPart.getInputStream()) {
+ try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+ //exceptions thrown while reading the macros are swallowed inside OfficeParser.extractMacros
+ OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+ }
+ } catch (IOException e) {
+ throw new TikaException("Broken OOXML file", e);
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 0ea58c0..160f761 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -280,6 +280,9 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
}
}
}
+ //add the main document part so that macros can be extracted
+ parts.add(document.getPackagePart());
+
return parts;
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index ae8b6cb..0f6957c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -320,6 +320,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
}
+ //add main document so that macros can be extracted
+ //by AbstractOOXMLExtractor
+ for (PackagePart part : extractor.getPackage().
+ getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+ parts.add(part);
+ }
+
return parts;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index cb93b55..eb1a814 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -472,4 +472,16 @@ public class ExcelParserTest extends TikaTest {
assertContains("1.23456789012345E15", xml);//16 digit number is treated as scientific notation
assertContains("1.23456789012345E15", xml);//16 digit formula, ditto
}
+
+ @Test
+ public void testMacroinXls() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index ca20be7..41400c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -30,6 +30,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -249,4 +250,16 @@ public class PowerPointParserTest extends TikaTest {
assertContains("Hello World", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
}
+
+ @Test
+ @Ignore("POI 3.15-final not finding any macros in this ppt")
+ public void testMacros() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 8b42ff1..e63a61b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
import org.junit.Test;
@@ -520,5 +521,17 @@ public class WordParserTest extends TikaTest {
assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
}
+
+ @Test
+ public void testMacros() throws Exception {
+ //debug(getRecursiveMetadata("SimpleMacro.doc"));
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ac62b03..ccfb293 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -578,39 +578,39 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
- assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
- assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
- for(int row=1;row<=3;row++) {
+ for (int row = 1; row <= 3; row++) {
//assertContains("�\tBullet " + row, content);
//assertContains("\u00b7\tBullet " + row, content);
assertContains("Bullet " + row, content);
}
assertContains("Here is a numbered list:", content);
- for(int row=1;row<=3;row++) {
+ for (int row = 1; row <= 3; row++) {
//assertContains(row + ")\tNumber bullet " + row, content);
//assertContains(row + ") Number bullet " + row, content);
// TODO: OOXMLExtractor fails to number the bullets:
assertContains("Number bullet " + row, content);
}
- for(int row=1;row<=2;row++) {
- for(int col=1;col<=3;col++) {
+ for (int row = 1; row <= 2; row++) {
+ for (int col = 1; col <= 3; col++) {
assertContains("Row " + row + " Col " + col, content);
}
}
assertContains("Keyword1 Keyword2", content);
assertEquals("Keyword1 Keyword2",
- metadata.get(Metadata.KEYWORDS));
+ metadata.get(Metadata.KEYWORDS));
assertContains("Subject is here", content);
// TODO: Remove subject in Tika 2.0
assertEquals("Subject is here",
- metadata.get(Metadata.SUBJECT));
+ metadata.get(Metadata.SUBJECT));
assertEquals("Subject is here",
- metadata.get(OfficeOpenXMLCore.SUBJECT));
+ metadata.get(OfficeOpenXMLCore.SUBJECT));
assertContains("Suddenly some Japanese text:", content);
// Special version of (GHQ)
@@ -642,21 +642,21 @@ public class OOXMLParserTest extends TikaTest {
assertContains("<p>Row 2 column 2</p>", xml);
assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
assertContains("<p>Here is a list:", xml);
- for(int row=1;row<=3;row++) {
+ for (int row = 1; row <= 3; row++) {
//assertContains("�\tBullet " + row, content);
//assertContains("\u00b7\tBullet " + row, content);
assertContains("<p>Bullet " + row, xml);
}
assertContains("Here is a numbered list:", xml);
- for(int row=1;row<=3;row++) {
+ for (int row = 1; row <= 3; row++) {
//assertContains(row + ")\tNumber bullet " + row, content);
//assertContains(row + ") Number bullet " + row, content);
// TODO: OOXMLExtractor fails to number the bullets:
assertContains("<p>Number bullet " + row, xml);
}
- for(int row=1;row<=2;row++) {
- for(int col=1;col<=3;col++) {
+ for (int row = 1; row <= 2; row++) {
+ for (int col = 1; col <= 3; col++) {
assertContains("Row " + row + " Col " + col, xml);
}
}
@@ -668,7 +668,7 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Subject is here", xml);
// TODO: Remove subject in Tika 2.0
assertEquals("Subject is here",
- metadata.get(Metadata.SUBJECT));
+ metadata.get(Metadata.SUBJECT));
assertEquals("Subject is here",
metadata.get(OfficeOpenXMLCore.SUBJECT));
@@ -1254,7 +1254,7 @@ public class OOXMLParserTest extends TikaTest {
String xml = getXML("testWORD_boldHyperlink.docx").xml;
xml = xml.replaceAll("\\s+", " ");
assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
- assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+ assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml);
}
@Test
@@ -1263,6 +1263,39 @@ public class OOXMLParserTest extends TikaTest {
assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml);
}
+ @Test
+ public void testMacrosInDocm() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
+ @Test
+ public void testMacrosInPptm() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
+ @Test
+ public void testMacroinXlsm() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm");
+ Metadata macroMetadata = metadataList.get(1);
+ assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+ macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls
new file mode 100644
index 0000000..b97f9b2
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm
new file mode 100644
index 0000000..d21452b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm differ
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt
new file mode 100644
index 0000000..7af9008
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm
new file mode 100644
index 0000000..058a039
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm differ
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc
new file mode 100644
index 0000000..838d86b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc differ
http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm
new file mode 100644
index 0000000..a915310
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm differ