You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/22 12:59:59 UTC

tika git commit: TIKA-2069 -- extract macros from MSOffice docs, fix tests to find target metadata object in any order

Repository: tika
Updated Branches:
  refs/heads/master 2ae7206d9 -> 8a45f67a2


TIKA-2069 -- extract macros from MSOffice docs, fix tests to find target metadata object in any order


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8a45f67a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8a45f67a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8a45f67a

Branch: refs/heads/master
Commit: 8a45f67a2e3641b08fcfb5e2283e4a43ff86f3cd
Parents: 2ae7206
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 22 08:59:53 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 22 08:59:53 2016 -0400

----------------------------------------------------------------------
 .../src/test/java/org/apache/tika/TikaTest.java | 38 +++++++++++++++++
 .../tika/parser/microsoft/ExcelParserTest.java  | 17 ++++----
 .../parser/microsoft/PowerPointParserTest.java  | 18 +++++---
 .../tika/parser/microsoft/WordParserTest.java   | 15 ++++---
 .../parser/microsoft/ooxml/OOXMLParserTest.java | 45 +++++++++++---------
 5 files changed, 91 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a699ac8..690db33 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -99,6 +99,44 @@ public abstract class TikaTest {
         assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
     }
 
+    /**
+     * Test that in at least one item in metadataList, all keys and values
+     * in minExpected are contained.
+     * <p>
+     * The values in minExpected are tested for whether they are contained
+     * within a value in the target.  If minExpected=&quot;text/x-vbasic&quot; and
+     * what was actually found in the target within metadataList is
+     * &quot;text/x-vbasic; charset=windows-1252&quot;,
+     * that is counted as a hit.
+     *
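+     * @param minExpected keys and values that must all be contained in a single item
+     * @param metadataList candidate items; at least one must contain everything in minExpected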
+     */
+    public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {
+
+        for (Metadata m : metadataList) {
+            int foundPropertyCount = 0;
+            for (String n : minExpected.names()) {
+                int foundValCount = 0;
+                for (String foundVal : m.getValues(n)) {
+                    for (String expectedVal : minExpected.getValues(n)) {
+                        if (foundVal.contains(expectedVal)) {
+                            foundValCount++;
+                        }
+                    }
+                }
+                if (foundValCount == minExpected.getValues(n).length) {
+                    foundPropertyCount++;
+                }
+            }
+            if (foundPropertyCount == minExpected.names().length) {
+                //found everything!
+                return;
+            }
+        }
+        //TODO: figure out how to have more informative error message
+        fail("Couldn't find everything within a single metadata item");
+    }
     protected static class XMLResult {
         public final String xml;
         public final Metadata metadata;

http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index eb1a814..db137e0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -474,14 +474,15 @@ public class ExcelParserTest extends TikaTest {
     }
 
     @Test
-    public void testMacroinXls() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    public void testMacros() throws  Exception {
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls"));
     }
 
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 41400c5..41c5077 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -254,12 +254,18 @@ public class PowerPointParserTest extends TikaTest {
     @Test
     @Ignore("POI 3.15-final not finding any macros in this ppt")
     public void testMacros() throws  Exception {
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
         List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertContainsAtLeast(minExpected, metadataList);
     }
+
+
+
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index e63a61b..bfb7ca1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -524,14 +524,15 @@ public class WordParserTest extends TikaTest {
 
     @Test
     public void testMacros() throws  Exception {
-        //debug(getRecursiveMetadata("SimpleMacro.doc"));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
         List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertContainsAtLeast(minExpected, metadataList);
     }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ccfb293..5e0fc1e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1265,35 +1265,38 @@ public class OOXMLParserTest extends TikaTest {
 
     @Test
     public void testMacrosInDocm() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm"));
     }
 
     @Test
     public void testMacrosInPptm() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm"));
     }
 
     @Test
     public void testMacroinXlsm() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm");
-        Metadata macroMetadata = metadataList.get(1);
-        assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
-                macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm"));
     }
 
 }