You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:45 UTC

[2/7] tika git commit: TIKA-2191 -- step2 -- add handling for docm files...extract macros

TIKA-2191 -- step 2 -- add handling for .docm files and extract their macros


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f93d4e1f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f93d4e1f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f93d4e1f

Branch: refs/heads/master
Commit: f93d4e1fffdb4a441f7fa750a43691adfa70c392
Parents: 8943013
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 11:14:34 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:00 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      | 15 +++++--
 .../microsoft/ooxml/SXWPFExtractorTest.java     | 42 +++++++++++---------
 2 files changed, 35 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index e08dab1..ee88f15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -72,7 +72,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         //handle main document
-        List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        List<PackagePart> pps = getMainDocumentParts();
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -81,7 +81,6 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
         //handle glossary document
         pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -145,7 +144,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                                     new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
                                             context.get(OfficeParserConfig.class)), hyperlinks))));
         } catch (TikaException e) {
-            e.printStackTrace();
+            //swallow
         }
 
     }
@@ -217,6 +216,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-        return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        //figure out which one this is
+        List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        if (pps.size() == 0) {
+            pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
+            if (pps.size() == 0) {
+                pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+            }
+        }
+        return pps;
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index fb7a977..dffa112 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -148,9 +148,6 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
                 content);
 
-        //TODO: add chart parsing
-//        assertContains("This is the chart", content);
-
         assertContains("This is a comment", content);
 
         assertContains("This is an endnote", content);
@@ -177,6 +174,9 @@ public class SXWPFExtractorTest extends TikaTest {
         //TODO: extract chart text
 //        assertContains("This is the chart title", content);
 
+        //TODO: add chart parsing
+//        assertContains("This is the chart", content);
+
     }
 
     /**
@@ -261,15 +261,18 @@ public class SXWPFExtractorTest extends TikaTest {
 
         // Text too
         assertTrue(xml.contains("<p>The end!</p>"));
+    }
 
+    @Test
+    public void testContiguousHTMLFormatting() throws Exception {
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
-        xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+        String xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
 
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
 
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
@@ -278,7 +281,7 @@ public class SXWPFExtractorTest extends TikaTest {
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
     /**
@@ -311,9 +314,9 @@ public class SXWPFExtractorTest extends TikaTest {
 
     @Test
     public void testVarious() throws Exception {
-        XMLResult xmlResult = getXML("testWORD_various.docx", parseContext);
-        String content = xmlResult.xml;
-        Metadata metadata = xmlResult.metadata;
+        Metadata metadata = new Metadata();
+        String content = getText(getResourceAsStream("/test-documents/testWORD_various.docx"),
+                new AutoDetectParser(), parseContext, metadata);
         //content = content.replaceAll("\\s+"," ");
         assertContains("Footnote appears here", content);
         assertContains("This is a footnote.", content);
@@ -328,8 +331,8 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
-//TODO:        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
-//TODO:        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);
         for (int row = 1; row <= 3; row++) {
@@ -522,7 +525,7 @@ public class SXWPFExtractorTest extends TikaTest {
         String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
         int a = xml.indexOf("This file contains a thumbnail");
         int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
-
+        System.out.println(xml);
         assertTrue(a != -1);
         assertTrue(b != -1);
         assertTrue(a < b);
@@ -566,11 +569,11 @@ public class SXWPFExtractorTest extends TikaTest {
     }
 
     @Test
+    @Ignore("TODO -- paragraph list numbers")
     public void testDOCXParagraphNumbering() throws Exception {
         String xml = getXML("testWORD_numbered_list.docx", parseContext).xml;
-        //SAX parser is getting this.  DOM parser is not
+        //SAX parser is getting this.  DOM parser is not!
         assertContains("add a list here", xml);
-/*TODO:
         assertContains("1) This", xml);
         assertContains("a) Is", xml);
         assertContains("i) A multi", xml);
@@ -591,11 +594,11 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
         assertContains("1.1.1. 1.1.1", xml);
         assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
-*/
+
     }
 
     @Test
-    @Ignore("TODO")
+    @Ignore("TODO -- paragraph list numbers")
     public void testDOCXOverrideParagraphNumbering() throws Exception {
         String xml = getXML("testWORD_override_list_numbering.docx").xml;
 
@@ -678,8 +681,11 @@ public class SXWPFExtractorTest extends TikaTest {
     }
 
     @Test
-    @Ignore("TODO")
     public void testMacrosInDocm() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
+        //check that content came out of the .docm file
+        assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
@@ -687,7 +693,7 @@ public class SXWPFExtractorTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+        assertContainsAtLeast(minExpected, metadataList);//, parseContext));
     }