You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/22 12:59:59 UTC
tika git commit: TIKA-2069 -- extract macros from MSOffice docs,
fix tests to find target metadata object in any order
Repository: tika
Updated Branches:
refs/heads/master 2ae7206d9 -> 8a45f67a2
TIKA-2069 -- extract macros from MSOffice docs, fix tests to find target metadata object in any order
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8a45f67a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8a45f67a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8a45f67a
Branch: refs/heads/master
Commit: 8a45f67a2e3641b08fcfb5e2283e4a43ff86f3cd
Parents: 2ae7206
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 22 08:59:53 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 22 08:59:53 2016 -0400
----------------------------------------------------------------------
.../src/test/java/org/apache/tika/TikaTest.java | 38 +++++++++++++++++
.../tika/parser/microsoft/ExcelParserTest.java | 17 ++++----
.../parser/microsoft/PowerPointParserTest.java | 18 +++++---
.../tika/parser/microsoft/WordParserTest.java | 15 ++++---
.../parser/microsoft/ooxml/OOXMLParserTest.java | 45 +++++++++++---------
5 files changed, 91 insertions(+), 42 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a699ac8..690db33 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -99,6 +99,44 @@ public abstract class TikaTest {
assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
}
+ /**
+  * Asserts that at least one item in metadataList contains every key and
+  * every value present in minExpected.
+  * <p>
+  * Expected values are matched by containment, not equality: an expected
+  * value counts as found when it is a substring of at least one value stored
+  * under the same key in the candidate item. If minExpected holds
+  * &quot;text/x-vbasic&quot; and what was actually found in the target within
+  * metadataList is &quot;text/x-vbasic; charset=windows-1252&quot;,
+  * that is counted as a hit.
+  * <p>
+  * Extra keys or extra values in a candidate item never cause a miss, and
+  * the position of the matching item within metadataList does not matter.
+  *
+  * @param minExpected  the keys and values that must all be found within a
+  *                     single item of metadataList
+  * @param metadataList the candidate items to search
+  */
+ public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {
+
+     for (Metadata m : metadataList) {
+         if (containsAllExpected(minExpected, m)) {
+             //found everything!
+             return;
+         }
+     }
+     //TODO: figure out how to have more informative error message
+     fail("Couldn't find everything within a single metadata item");
+ }
+
+ /**
+  * Checks whether every value of every key in minExpected is a substring
+  * of at least one value stored under the same key in candidate.
+  */
+ private static boolean containsAllExpected(Metadata minExpected, Metadata candidate) {
+     for (String n : minExpected.names()) {
+         String[] foundVals = candidate.getValues(n);
+         for (String expectedVal : minExpected.getValues(n)) {
+             // Each expected value must be matched on its own; counting
+             // (found, expected) pairs would let a value that matches twice
+             // hide one that never matches, or overshoot and miss a real hit.
+             boolean matched = false;
+             for (String foundVal : foundVals) {
+                 if (foundVal.contains(expectedVal)) {
+                     matched = true;
+                     break;
+                 }
+             }
+             if (!matched) {
+                 return false;
+             }
+         }
+     }
+     return true;
+ }
protected static class XMLResult {
public final String xml;
public final Metadata metadata;
http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index eb1a814..db137e0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -474,14 +474,15 @@ public class ExcelParserTest extends TikaTest {
}
@Test
- public void testMacroinXls() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xls");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ public void testMacros() throws Exception {
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+ assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls"));
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 41400c5..41c5077 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -254,12 +254,18 @@ public class PowerPointParserTest extends TikaTest {
@Test
@Ignore("POI 3.15-final not finding any macros in this ppt")
public void testMacros() throws Exception {
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertContainsAtLeast(minExpected, metadataList);
}
+
+
+
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index e63a61b..bfb7ca1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -524,14 +524,15 @@ public class WordParserTest extends TikaTest {
@Test
public void testMacros() throws Exception {
- //debug(getRecursiveMetadata("SimpleMacro.doc"));
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertContainsAtLeast(minExpected, metadataList);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/8a45f67a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index ccfb293..5e0fc1e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1265,35 +1265,38 @@ public class OOXMLParserTest extends TikaTest {
@Test
public void testMacrosInDocm() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+ assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm"));
}
@Test
public void testMacrosInPptm() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Embolden()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("Sub Italicize()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+ assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm"));
}
@Test
public void testMacroinXlsm() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_macro.xlsm");
- Metadata macroMetadata = metadataList.get(1);
- assertContains("Sub Dirty()", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("dirty dirt dirt", macroMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertContains("text/x-vbasic", macroMetadata.get(Metadata.CONTENT_TYPE));
- assertEquals(TikaCoreProperties.EmbeddedResourceType.MACRO.toString(),
- macroMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ Metadata minExpected = new Metadata();
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
+ minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
+ minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+ assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm"));
}
}