You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/22 01:03:32 UTC

tika git commit: TIKA-2069 -- extract macros from MSOffice docs

Repository: tika
Updated Branches:
  refs/heads/master 415381212 -> 2ae7206d9


TIKA-2069 -- extract macros from MSOffice docs


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2ae7206d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2ae7206d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2ae7206d

Branch: refs/heads/master
Commit: 2ae7206d9c99fb553314cff21bb155d4e6f06d12
Parents: 4153812
Author: tballison <ta...@mitre.org>
Authored: Wed Sep 21 21:03:20 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Sep 21 21:03:20 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../tika/metadata/TikaCoreProperties.java       |  14 ++++-
 .../tika/parser/microsoft/OfficeParser.java     |  50 +++++++++++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |  20 +++++-
 .../ooxml/XSLFPowerPointExtractorDecorator.java |   3 +
 .../ooxml/XSSFExcelExtractorDecorator.java      |   7 +++
 .../tika/parser/microsoft/ExcelParserTest.java  |  12 ++++
 .../parser/microsoft/PowerPointParserTest.java  |  13 ++++
 .../tika/parser/microsoft/WordParserTest.java   |  13 ++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  63 ++++++++++++++-----
 .../test-documents/testEXCEL_macro.xls          | Bin 0 -> 30720 bytes
 .../test-documents/testEXCEL_macro.xlsm         | Bin 0 -> 14561 bytes
 .../resources/test-documents/testPPT_macros.ppt | Bin 0 -> 88064 bytes
 .../test-documents/testPPT_macros.pptm          | Bin 0 -> 32824 bytes
 .../test-documents/testWORD_macros.doc          | Bin 0 -> 38400 bytes
 .../test-documents/testWORD_macros.docm         | Bin 0 -> 17322 bytes
 16 files changed, 178 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index fc94e70..9a03b01 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Extract macros from MSOffice files (TIKA-2069).
+
   * Maintain passed-in mime in TXTParser (TIKA-2047).
 
   * Upgrade to POI.3-15 (TIKA-2013).

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index f4b97dd..9245086 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -39,16 +39,24 @@ public interface TikaCoreProperties {
 
     /**
      * A file might contain different types of embedded documents.
-     * The most common is the ATTACHEMENT.
+     * The most common is the ATTACHMENT.
+     * <p>
      * An INLINE embedded resource should be used for embedded image
      * files that are used to render the page image (as in PDXObjImages in PDF files).
      * <p>
-     * Not all parsers have yet implemented this. 
+     * A MACRO is code that is embedded in the document and is intended
+     * to be executable within the application that opens the document.  This
+     * includes traditional macros within Microsoft Office files and
+     * javascript within PDFActions.  This would not include, e.g., an
+     * .exe file embedded in a .zip file.
+     * <p>
+     * Not all parsers have yet implemented this.
      *
      */
     public enum EmbeddedResourceType {
         INLINE,
-        ATTACHMENT
+        ATTACHMENT,
+        MACRO
     };
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b6681aa..f7f1c4a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -16,13 +16,16 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.security.GeneralSecurityException;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
@@ -35,11 +38,15 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -117,9 +124,17 @@ public class OfficeParser extends AbstractParser {
                     //tstream will close the fs, no need to close this below
                     tstream.setOpenContainer(fs);
                     root = fs.getRoot();
+
                 }
             }
             parse(root, context, metadata, xhtml);
+
+            //now try to get macros
+            EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+            if (ex == null) {
+                ex = new ParsingEmbeddedDocumentExtractor(context);
+            }
+            extractMacros(root.getNFileSystem(), xhtml, ex);
         } finally {
             IOUtils.closeQuietly(mustCloseFs);
         }
@@ -279,4 +294,39 @@ public class OfficeParser extends AbstractParser {
         }
     }
 
+    /**
+     * Helper to extract macros from an NPOIFS/vbaProject.bin
+     *
+     * As of POI-3.15-final, there are still some bugs in VBAMacroReader.
+     * For now, any exception thrown while reading the macros (NPE included) is swallowed.
+     *
+     * @param fs NPOIFS to extract from
+     * @param xhtml SAX writer
+     * @param embeddedDocumentExtractor extractor for embedded documents
+     * @throws IOException if an I/O error occurs during the extraction of the embedded doc
+     * @throws SAXException if an error occurs while writing to xhtml
+     */
+    public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor
+            embeddedDocumentExtractor)  throws IOException, SAXException {
+
+        VBAMacroReader reader = null;
+        Map<String, String> macros = null;
+        try {
+            reader = new VBAMacroReader(fs);
+            macros = reader.readMacros();
+        } catch (Exception e) {
+            //swallow
+            return;
+        }
+        for (Map.Entry<String, String> e : macros.entrySet()) {
+            Metadata m = new Metadata();
+            m.set(Metadata.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+            m.set(Metadata.CONTENT_TYPE, "text/x-vbasic");
+            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+                embeddedDocumentExtractor.parseEmbedded(
+                        new ByteArrayInputStream(e.getValue().getBytes(StandardCharsets.UTF_8)), xhtml, m, true);
+            }
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 67468b0..1f16a3c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -33,6 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -43,6 +44,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,7 +66,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
     static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
     static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
     static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
-
+    static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
+    static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
     private final EmbeddedDocumentExtractor embeddedExtractor;
@@ -197,6 +200,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                                 || RELATION_PACKAGE.equals(type)
                                 || RELATION_OLE_OBJECT.equals(type)) {
                             handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+                        } else if (RELATION_MACRO.equals(type)) {
+                            handleMacros(target, handler);
                         }
                     }
                 }
@@ -325,4 +330,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
      */
     protected abstract List<PackagePart> getMainDocumentParts()
             throws TikaException;
+
+
+    void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException {
+
+        try (InputStream is = macroPart.getInputStream()) {
+            try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+                //Macro reading exceptions are already swallowed inside extractMacros
+                OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
+            }
+        } catch (IOException e) {
+            throw new TikaException("Broken OOXML file", e);
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 0ea58c0..160f761 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -280,6 +280,9 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
                 }
             }
         }
+        //add full document to include macros
+        parts.add(document.getPackagePart());
+
         return parts;
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index ae8b6cb..0f6957c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -320,6 +320,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
 
+        //add main document so that macros can be extracted
+        //by AbstractOOXMLExtractor
+        for (PackagePart part : extractor.getPackage().
+                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+            parts.add(part);
+        }
+
         return parts;
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index cb93b55..eb1a814 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -472,4 +472,16 @@ public class ExcelParserTest extends TikaTest {
         assertContains("1.23456789012345E15", xml);//16 digit number is treated as scientific notation
         assertContains("1.23456789012345E15", xml);//16 digit formula, ditto
     }
+
+    @Test
+    public void testMacroinXls() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index ca20be7..41400c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -30,6 +30,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -249,4 +250,16 @@ public class PowerPointParserTest extends TikaTest {
         assertContains("Hello World", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
         assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
     }
+
+    @Test
+    @Ignore("POI 3.15-final not finding any macros in this ppt")
+    public void testMacros() throws  Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 8b42ff1..e63a61b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -520,5 +521,17 @@ public class WordParserTest extends TikaTest {
         assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
         assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
     }
+
+    @Test
+    public void testMacros() throws  Exception {
+        //debug(getRecursiveMetadata("SimpleMacro.doc"));
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ac62b03..ccfb293 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -578,39 +578,39 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
-        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
-        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains("�\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
             assertContains("Bullet " + row, content);
         }
         assertContains("Here is a numbered list:", content);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
             // TODO: OOXMLExtractor fails to number the bullets:
             assertContains("Number bullet " + row, content);
         }
 
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
                 assertContains("Row " + row + " Col " + col, content);
             }
         }
 
         assertContains("Keyword1 Keyword2", content);
         assertEquals("Keyword1 Keyword2",
-                     metadata.get(Metadata.KEYWORDS));
+                metadata.get(Metadata.KEYWORDS));
 
         assertContains("Subject is here", content);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
 
         assertContains("Suddenly some Japanese text:", content);
         // Special version of (GHQ)
@@ -642,21 +642,21 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("<p>Row 2 column 2</p>", xml);
         assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
         assertContains("<p>Here is a list:", xml);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains("�\tBullet " + row, content);
             //assertContains("\u00b7\tBullet " + row, content);
             assertContains("<p>Bullet " + row, xml);
         }
         assertContains("Here is a numbered list:", xml);
-        for(int row=1;row<=3;row++) {
+        for (int row = 1; row <= 3; row++) {
             //assertContains(row + ")\tNumber bullet " + row, content);
             //assertContains(row + ") Number bullet " + row, content);
             // TODO: OOXMLExtractor fails to number the bullets:
             assertContains("<p>Number bullet " + row, xml);
         }
 
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
                 assertContains("Row " + row + " Col " + col, xml);
             }
         }
@@ -668,7 +668,7 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Subject is here", xml);
         // TODO: Remove subject in Tika 2.0
         assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
+                metadata.get(Metadata.SUBJECT));
         assertEquals("Subject is here",
                 metadata.get(OfficeOpenXMLCore.SUBJECT));
 
@@ -1254,7 +1254,7 @@ public class OOXMLParserTest extends TikaTest {
         String xml = getXML("testWORD_boldHyperlink.docx").xml;
         xml = xml.replaceAll("\\s+", " ");
         assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
-        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml);
     }
 
     @Test
@@ -1263,6 +1263,39 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx").xml);
     }
 
+    @Test
+    public void testMacrosInDocm() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testMacrosInPptm() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testMacroinXlsm() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm");
+        Metadata macroMetadata = metadataList.get(1);
+        assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
+                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls
new file mode 100644
index 0000000..b97f9b2
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xls differ

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm
new file mode 100644
index 0000000..d21452b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_macro.xlsm differ

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt
new file mode 100644
index 0000000..7af9008
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.ppt differ

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm
new file mode 100644
index 0000000..058a039
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_macros.pptm differ

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc
new file mode 100644
index 0000000..838d86b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.doc differ

http://git-wip-us.apache.org/repos/asf/tika/blob/2ae7206d/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm
new file mode 100644
index 0000000..a915310
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_macros.docm differ