You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:45 UTC
[2/7] tika git commit: TIKA-2191 -- step2 -- add handling for docm files...extract macros
TIKA-2191 -- step2 -- add handling for docm files...extract macros
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f93d4e1f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f93d4e1f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f93d4e1f
Branch: refs/heads/master
Commit: f93d4e1fffdb4a441f7fa750a43691adfa70c392
Parents: 8943013
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 11:14:34 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:00 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 15 +++++--
.../microsoft/ooxml/SXWPFExtractorTest.java | 42 +++++++++++---------
2 files changed, 35 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index e08dab1..ee88f15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -72,7 +72,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
//handle main document
- List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ List<PackagePart> pps = getMainDocumentParts();
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -81,7 +81,6 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
//handle glossary document
pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -145,7 +144,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
context.get(OfficeParserConfig.class)), hyperlinks))));
} catch (TikaException e) {
- e.printStackTrace();
+ //swallow
}
}
@@ -217,6 +216,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
- return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ //figure out which one this is
+ List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ if (pps.size() == 0) {
+ pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
+ if (pps.size() == 0) {
+ pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+ }
+ }
+ return pps;
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index fb7a977..dffa112 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -148,9 +148,6 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
content);
- //TODO: add chart parsing
-// assertContains("This is the chart", content);
-
assertContains("This is a comment", content);
assertContains("This is an endnote", content);
@@ -177,6 +174,9 @@ public class SXWPFExtractorTest extends TikaTest {
//TODO: extract chart text
// assertContains("This is the chart title", content);
+ //TODO: add chart parsing
+// assertContains("This is the chart", content);
+
}
/**
@@ -261,15 +261,18 @@ public class SXWPFExtractorTest extends TikaTest {
// Text too
assertTrue(xml.contains("<p>The end!</p>"));
+ }
+ @Test
+ public void testContiguousHTMLFormatting() throws Exception {
// TIKA-692: test document containing multiple
// character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+ String xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
// Make sure bold text arrived as single
// contiguous string even though Word parser
// handled this as 3 character runs
-//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
// TIKA-692: test document containing multiple
// character runs within a bold tag:
@@ -278,7 +281,7 @@ public class SXWPFExtractorTest extends TikaTest {
// Make sure bold text arrived as single
// contiguous string even though Word parser
// handled this as 3 character runs
-//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
/**
@@ -311,9 +314,9 @@ public class SXWPFExtractorTest extends TikaTest {
@Test
public void testVarious() throws Exception {
- XMLResult xmlResult = getXML("testWORD_various.docx", parseContext);
- String content = xmlResult.xml;
- Metadata metadata = xmlResult.metadata;
+ Metadata metadata = new Metadata();
+ String content = getText(getResourceAsStream("/test-documents/testWORD_various.docx"),
+ new AutoDetectParser(), parseContext, metadata);
//content = content.replaceAll("\\s+"," ");
assertContains("Footnote appears here", content);
assertContains("This is a footnote.", content);
@@ -328,8 +331,8 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
-//TODO: assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
-//TODO: assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
for (int row = 1; row <= 3; row++) {
@@ -522,7 +525,7 @@ public class SXWPFExtractorTest extends TikaTest {
String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
int a = xml.indexOf("This file contains a thumbnail");
int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
-
+ System.out.println(xml);
assertTrue(a != -1);
assertTrue(b != -1);
assertTrue(a < b);
@@ -566,11 +569,11 @@ public class SXWPFExtractorTest extends TikaTest {
}
@Test
+ @Ignore("TODO -- paragraph list numbers")
public void testDOCXParagraphNumbering() throws Exception {
String xml = getXML("testWORD_numbered_list.docx", parseContext).xml;
- //SAX parser is getting this. DOM parser is not
+ //SAX parser is getting this. DOM parser is not!
assertContains("add a list here", xml);
-/*TODO:
assertContains("1) This", xml);
assertContains("a) Is", xml);
assertContains("i) A multi", xml);
@@ -591,11 +594,11 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
assertContains("1.1.1. 1.1.1", xml);
assertContains("1.1. 1.2->1.1 //set the value", xml);
-*/
+
}
@Test
- @Ignore("TODO")
+ @Ignore("TODO -- paragraph list numbers")
public void testDOCXOverrideParagraphNumbering() throws Exception {
String xml = getXML("testWORD_override_list_numbering.docx").xml;
@@ -678,8 +681,11 @@ public class SXWPFExtractorTest extends TikaTest {
}
@Test
- @Ignore("TODO")
public void testMacrosInDocm() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
+ //check that content came out of the .docm file
+ assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
@@ -687,7 +693,7 @@ public class SXWPFExtractorTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+ assertContainsAtLeast(minExpected, metadataList);//, parseContext));
}