You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:44 UTC

[1/7] tika git commit: TIKA-2191 -- step1 -- add other docx tests and comment/ignore where appropriate

Repository: tika
Updated Branches:
  refs/heads/master 99b592437 -> 5425d02a1


TIKA-2191 -- step1 -- add other docx tests and comment/ignore where appropriate


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/89430130
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/89430130
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/89430130

Branch: refs/heads/master
Commit: 894301307da5167c95585688f9448d3050f53aaa
Parents: 99b5924
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 10:10:37 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:00:37 2016 -0500

----------------------------------------------------------------------
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  16 +
 .../microsoft/ooxml/SXWPFExtractorTest.java     | 694 +++++++++++++++++++
 .../ooxml/xwpf/SXWPFExtractorTest.java          | 187 -----
 .../parser/microsoft/tika-config-sax-docx.xml   |  27 +
 4 files changed, 737 insertions(+), 187 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bfbd8ce..0059d09 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import javax.xml.transform.OutputKeys;
@@ -37,6 +38,7 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -1330,6 +1332,20 @@ public class OOXMLParserTest extends TikaTest {
         System.out.println("elapsed: "+(new Date().getTime()-started) + " with " + ex + " exceptions");
     }
 
+    @Test
+    public void testInitializationViaConfig() throws Exception {
+        //Parses a docx through an AutoDetectParser built from tika-config-sax-docx.xml.
+        //NOTE: this test relies on a bug in the DOM extractor that is passing over the
+        //title information; once we fix that, this test will no longer be meaningful!
+        String configPath = "/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml";
+        try (InputStream is = getClass().getResourceAsStream(configPath)) { //closed on every exit path
+            assertNotNull(is);
+            AutoDetectParser p = new AutoDetectParser(new TikaConfig(is));
+            XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata());
+            assertContains("engaging title", xml.xml);
+        }
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
new file mode 100644
index 0000000..fb7a977
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -0,0 +1,694 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+
+public class SXWPFExtractorTest extends TikaTest {
+
+    private ParseContext parseContext;
+
+    @Before
+    public void setUp() {
+        OfficeParserConfig saxConfig = new OfficeParserConfig();
+        saxConfig.setUseSAXDocxExtractor(true);
+        //fresh context per test, asking for the SAX docx extractor
+        parseContext = new ParseContext();
+        parseContext.set(OfficeParserConfig.class, saxConfig);
+    }
+
+    @Test
+    public void basicTest() throws Exception {
+        //parse with the SAX-enabled parseContext; the first Metadata entry is checked: metadata, then content
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.docx", parseContext);
+
+        assertEquals(8, metadataList.size());
+        Metadata m = metadataList.get(0);
+
+        assertEquals("2016-11-29T00:58:00Z", m.get(TikaCoreProperties.CREATED));
+        assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED));
+        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
+        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("3", m.get(OfficeOpenXMLCore.REVISION));
+        assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER));
+        //assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
+        assertEquals("260", m.get(Office.WORD_COUNT));
+        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
+        assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
+        assertEquals("12", m.get(Office.LINE_COUNT));
+        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
+
+
+        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+
+        assertContainsCount("engaging title page", content, 1);
+        //need \n to differentiate from metadata values
+        assertContainsCount("This is the Author\n", content, 1);
+        assertContainsCount("This is an engaging title page", content, 1);
+
+        assertContains("My Document Title", content);
+        assertContains("My Document Subtitle", content);
+
+        assertContains("<p>\tHeading1\t3</p>", content);
+
+
+        //TODO: integrate numbering
+        assertContains("Really basic 2.", content);
+
+        assertContainsCount("This is a text box", content, 1);
+
+        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
+
+        assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content);
+
+        assertContains("<p>This is          10 spaces</p>", content);
+
+        //caption
+        assertContains("<p>Table 1: Table1 Caption</p>", content);
+
+        //embedded table
+        //TODO: figure out how to handle embedded tables in html
+        assertContains("<p>Embedded table r1c1</p>", content);
+
+        //shape
+        assertContainsCount("<p>This is text within a shape", content, 1);
+
+        //sdt rich text
+        assertContains("<p>Rich text content control", content);
+
+        //sdt simple text
+        assertContains("<p>Simple text content control", content);
+
+        //sdt repeating
+        assertContains("Repeating content", content);
+
+        //sdt dropdown
+        //TODO: get options for dropdown
+        assertContains("Drop down1", content);
+
+        //sdt date
+        assertContains("<p>11/16/2016</p>", content);
+
+        //test that <tab/> works
+        assertContains("tab\ttab", content);
+
+        assertContainsCount("serious word art", content, 1);
+        assertContainsCount("Wordartr1c1", content, 1);
+
+        //glossary document contents
+        assertContains("Click or tap to enter a date", content);
+
+        //basic formatting
+        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
+                content);
+
+        //TODO: add chart parsing
+//        assertContains("This is the chart", content);
+
+        assertContains("This is a comment", content);
+
+        assertContains("This is an endnote", content);
+
+        assertContains("this is the footnote", content);
+
+        assertContains("First page header", content);
+
+        assertContains("Even page header", content);
+
+        assertContains("Odd page header", content);
+
+        assertContains("First page footer", content);
+
+        assertContains("Even page footer", content);
+
+        assertContains("Odd page footer", content);
+
+        //test default does not include deleted
+        assertNotContained("frog", content);
+
+        assertContains("Mattmann", content);
+
+        //TODO: extract chart text
+//        assertContains("This is the chart title", content);
+
+    }
+
+    /**
+     * Parses testWORD.docx and checks the content type, title and
+     * author metadata as well as the title text in the XML output.
+     *
+     * @throws Exception if parsing fails
+     */
+    @Test
+    public void testWord() throws Exception {
+        XMLResult parsed = getXML("testWORD.docx", parseContext);
+        Metadata meta = parsed.metadata;
+        String docxType =
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+        assertEquals(docxType, meta.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", meta.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));
+        assertTrue(parsed.xml.contains("Sample Word Document"));
+    }
+
+    /**
+     * Checks that the text "snoska" from footnotes.docx shows up in the output.
+     *
+     * @throws Exception if parsing fails
+     */
+    @Test
+    public void testWordFootnote() throws Exception {
+        XMLResult parsed = getXML("footnotes.docx", parseContext);
+        String docxType =
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+        assertEquals(docxType, parsed.metadata.get(Metadata.CONTENT_TYPE));
+        //"snoska" is presumably the footnote text of the fixture -- TODO confirm
+        assertTrue(parsed.xml.contains("snoska"));
+    }
+
+    /**
+     * Checks the XHTML produced for several docx fixtures; assertions
+     * prefixed with "TODO:" are commented out and currently not checked.
+     */
+    @Test
+    public void testWordHTML() throws Exception {
+        XMLResult result = getXML("testWORD.docx", parseContext);
+        String xml = result.xml;
+        Metadata metadata = result.metadata;
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(xml.contains("Sample Word Document"));
+
+        // Check that custom headings came through
+//TODO:        assertTrue(xml.contains("<h1 class=\"title\">"));
+
+        // Regular headings
+//TODO:        assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+//TODO:        assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+        // Headings with anchor tags in them
+//TODO:        assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
+        // Bold and italic
+        assertTrue(xml.contains("<b>BOLD</b>"));
+        assertTrue(xml.contains("<i>ITALIC</i>"));
+        // Table
+        assertTrue(xml.contains("<table>"));
+        assertTrue(xml.contains("<td>"));
+        // Links
+        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+        // Anchor links
+//TODO:        assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml);
+        // Paragraphs with other styles
+//TODO:        assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+        result = getXML("testWORD_3imgs.docx", parseContext);
+        xml = result.xml;
+
+        // Images 2-4 (there is no 1!)
+//TODO:        assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
+//TODO:        assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
+//TODO:        assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
+
+        // Text too
+        assertTrue(xml.contains("<p>The end!</p>"));
+
+        // TIKA-692: test document containing multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+
+        // Make sure bold text arrived as single
+        // contiguous string even though Word parser
+        // handled this as 3 character runs
+//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+        // TIKA-692: second fixture with multiple
+        // character runs within a bold tag:
+        xml = getXML("testWORD_bold_character_runs2.docx", parseContext).xml;
+
+        // Make sure bold text arrived as single
+        // contiguous string even though Word parser
+        // handled this as 3 character runs
+//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+    }
+
+    /**
+     * Test that we can extract an image from a docx header; still ignored
+     * (TODO). Both parses use parseContext so the SAX extractor is exercised.
+     */
+    @Test
+    @Ignore("TODO")
+    public void testWordPicturesInHeader() throws Exception {
+        assertEquals(2, getRecursiveMetadata("headerPic.docx", parseContext).size());
+        XMLResult xmlResult = getXML("headerPic.docx", parseContext);
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        // the header picture should surface as an <img> element
+        assertTrue(xmlResult.xml.contains("<img"));
+    }
+
+    /**
+     * Test docx without headers
+     * TIKA-633
+     */
+    @Test
+    public void testNullHeaders() throws Exception {
+        XMLResult xmlResult = getXML("NullHeader.docx", parseContext);
+        assertEquals("Should have found some text", false,
+                xmlResult.xml.isEmpty());
+
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        XMLResult xmlResult = getXML("testWORD_various.docx", parseContext);
+        String content = xmlResult.xml;
+        Metadata metadata = xmlResult.metadata;
+        //content = content.replaceAll("\\s+"," ");
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+//TODO:        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+//TODO:        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains("�\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: OOXMLExtractor fails to number the bullets:
+            assertContains("Number bullet " + row, content);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                metadata.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("Subject is here",
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Fullwidth form of (GHQ): fullwidth parentheses and fullwidth Latin letters
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // 12 further characters of Japanese text
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+        assertContains("And then some Gothic text:", content);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    @Test
+    public void testWordCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+        //parse with the shared SAX-enabled context rather than a bare ParseContext
+        try (InputStream input = getClass().getResourceAsStream(
+                "/test-documents/testWORD_custom_props.docx")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            parseContext.set(Locale.class, Locale.US);
+            //a bare context would silently exercise the default (non-SAX) extractor
+            new OOXMLParser().parse(input, handler, metadata, parseContext);
+        }
+
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+        assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
+        assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("1", metadata.get(Office.PAGE_COUNT));
+        assertEquals("2", metadata.get(Office.WORD_COUNT));
+        assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
+        assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+        assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    // TIKA-989: the text and the embedded-object placeholders must appear in this order
+    @Test
+    @Ignore("TODO")
+    public void testEmbeddedPDF() throws Exception {
+        String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+        String[] markers = {
+                "Here is the pdf file:", "<div class=\"embedded\" id=\"rId5\"/>",
+                "Bye Bye", "<div class=\"embedded\" id=\"rId6\"/>",
+                "Bye for real."
+        };
+        int[] offsets = new int[markers.length];
+        for (int n = 0; n < markers.length; n++) {
+            offsets[n] = xml.indexOf(markers[n]);
+            assertTrue(offsets[n] != -1);
+        }
+        //all markers are present; each one must come strictly after its predecessor
+        for (int n = 1; n < offsets.length; n++) {
+            assertTrue(offsets[n - 1] < offsets[n]);
+        }
+    }
+
+    // TIKA-1006 -- parsed with parseContext so that the SAX extractor is the one under test
+    @Test
+    public void testWordNullStyle() throws Exception {
+        String xml = getXML("testWORD_null_style.docx", parseContext).xml;
+        assertContains("Test av styrt dokument", xml);
+    }
+
+    /**
+     * TIKA-1044 - Handle word documents where parts of the
+     * text have no formatting or styles applied to them
+     */
+    @Test
+    public void testNoFormat() throws Exception {
+        assertContains("This is a piece of text that causes an exception",
+                getXML("testWORD_no_format.docx", parseContext).xml);
+    }
+
+    @Test
+    public void testSkipDeleted() throws Exception {
+        //despite the name, this switches deleted and move-from content ON
+        OfficeParserConfig withRevisions = new OfficeParserConfig();
+        withRevisions.setUseSAXDocxExtractor(true);
+        withRevisions.setIncludeDeletedContent(true);
+        withRevisions.setIncludeMoveFromContent(true);
+        ParseContext revisionContext = new ParseContext();
+        revisionContext.set(OfficeParserConfig.class, withRevisions);
+
+        String xml = getXML("testWORD_2006ml.docx", revisionContext).xml;
+        assertContains("frog", xml);
+        assertContainsCount("Second paragraph", xml, 2);
+    }
+
+    // TIKA-1005: body text plus text-box text in the body, header and footer
+    @Test
+    public void testTextInsideTextBox() throws Exception {
+        String xml = getXML("testWORD_text_box.docx", parseContext).xml;
+        assertContains("This text is directly in the body of the document.", xml);
+        for (String where : new String[]{"body", "header", "footer"}) {
+            assertContains("This text is inside of a text box in the " + where + " of the document.", xml);
+        }
+    }
+
+    /**
+     * Test for the missing text described in
+     * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>
+     * and TIKA-1317.
+     */
+    @Test
+    public void testMissingText() throws Exception {
+        XMLResult parsed = getXML("testWORD_missing_text.docx", parseContext);
+        String docxType =
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+        assertEquals(docxType, parsed.metadata.get(Metadata.CONTENT_TYPE));
+
+        String xml = parsed.xml;
+        assertContains("BigCompany", xml);
+        assertContains("Seasoned", xml);
+        assertContains("Rich_text_in_cell", xml);
+    }
+
+    //TIKA-792; with room for future missing bean tests
+    @Test
+    public void testWordMissingOOXMLBeans() throws Exception {
+        //If a bean is missing, POI prints stack trace to stderr
+        String[] fileNames = new String[]{
+                "testWORD_missing_ooxml_bean1.docx",//TIKA-792
+        };
+        PrintStream origErr = System.err;
+        for (String fileName : fileNames) {
+            //grab stderr
+            ByteArrayOutputStream errContent = new ByteArrayOutputStream();
+            System.setErr(new PrintStream(errContent, true, UTF_8.name()));
+            try {
+                getXML(fileName, parseContext);
+            } finally {
+                //return stderr even if the parse throws, so later tests keep their stderr
+                System.setErr(origErr);
+            }
+            String err = errContent.toString(UTF_8.name());
+            assertEquals("unexpected stderr output for " + fileName + ": " + err, 0, err.length());
+        }
+    }
+
+    @Test
+    public void testDOCXThumbnail() throws Exception {
+        //the thumbnail placeholder has to follow the body text
+        String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
+        int textAt = xml.indexOf("This file contains a thumbnail");
+        int thumbnailAt = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
+        assertTrue(textAt != -1);
+        assertTrue(thumbnailAt != -1);
+        assertTrue(textAt < thumbnailAt);
+    }
+
+    @Test
+    public void testEncrypted() throws Exception {
+        //file name -> text expected once the document has been decrypted
+        Map<String, String> tests = new HashMap<String, String>();
+        tests.put("testWORD_protected_passtika.docx",
+                "This is an encrypted Word 2007 File");
+
+        //answers every password request with "tika"
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "tika";
+            }
+        };
+
+        OfficeParserConfig opc = new OfficeParserConfig();
+        opc.setUseSAXDocxExtractor(true);
+        ParseContext passwordContext = new ParseContext();
+        passwordContext.set(PasswordProvider.class, passwordProvider);
+        passwordContext.set(OfficeParserConfig.class, opc);
+        for (Map.Entry<String, String> e : tests.entrySet()) {
+            assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml);
+        }
+
+        //now try with no password
+        for (Map.Entry<String, String> e : tests.entrySet()) {
+            boolean exc = false;
+            try {
+                getXML(e.getKey(), parseContext);
+            } catch (EncryptedDocumentException ex) {
+                //expected: parseContext carries no PasswordProvider
+                exc = true;
+            }
+            assertTrue("expected EncryptedDocumentException for " + e.getKey(), exc);
+        }
+    }
+
+    @Test
+    public void testDOCXParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_numbered_list.docx", parseContext).xml;
+        //SAX parser is getting this.  DOM parser is not
+        assertContains("add a list here", xml);
+/*TODO: the list-numbering assertions below are disabled and currently not checked:
+        assertContains("1) This", xml);
+        assertContains("a) Is", xml);
+        assertContains("i) A multi", xml);
+        assertContains("ii) Level", xml);
+        assertContains("1. Within cell 1", xml);
+        assertContains("b. Cell b", xml);
+        assertContains("iii) List", xml);
+        assertContains("2) foo", xml);
+        assertContains("ii) baz", xml);
+        assertContains("ii) foo", xml);
+        assertContains("II. bar", xml);
+        assertContains("6. six", xml);
+        assertContains("7. seven", xml);
+        assertContains("a. seven a", xml);
+        assertContains("e. seven e", xml);
+        assertContains("2. A ii 2", xml);
+        assertContains("3. page break list 3", xml);
+        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+        assertContains("1.1.1. 1.1.1", xml);
+        assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
+*/
+    }
+
+    @Test
+    @Ignore("TODO")
+    public void testDOCXOverrideParagraphNumbering() throws Exception {
+        String xml = getXML("testWORD_override_list_numbering.docx", parseContext).xml;
+        //expected rendered list numbering, grouped as Test 1-5; parsed via the SAX-enabled parseContext
+        //Test 1
+        assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml);
+        assertContains("1st.2.3someText 1st.2.3someText", xml);
+        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+        assertContains("5th 5th", xml);
+
+
+        //Test 2
+        assertContains("1.a.I 1.a.I", xml);
+        //test no reset because level 2 is not sufficient to reset
+        assertContains("<p>1.b.III 1.b.III</p>", xml);
+        //test restarted because of level 0's increment to 2
+        assertContains("2.a.I 2.a.I", xml);
+        //test handling of skipped level
+        assertContains("<p>2.b 2.b</p>", xml);
+
+        //Test 3
+        assertContains("(1)) (1))", xml);
+        //tests that level 1 starts at 17
+        assertContains("2.17 2.17", xml);
+        //tests that isLegal turns everything into decimal
+        assertContains("2.18.2.1 2.18.2.1", xml);
+        assertContains("<p>2 2</p>", xml);
+
+        //Test4
+        assertContains("<p>1 1</p>", xml);
+        assertContains("<p>A A</p>", xml);
+        assertContains("<p>B B</p>", xml);
+        //this tests overrides
+        assertContains("<p>C C</p>", xml);
+        assertContains("<p>4 4</p>", xml);
+
+        //Test5
+        assertContains(">00 00", xml);
+        assertContains(">01 01", xml);
+        assertContains(">01. 01.", xml);
+        assertContains(">01..1 01..1", xml);
+        assertContains(">02 02", xml);
+    }
+
+    @Test
+    public void testMultiAuthorsManagers() throws Exception {
+        Metadata meta = getXML("testWORD_multi_authors.docx", parseContext).metadata;
+        String[] creators = meta.getValues(TikaCoreProperties.CREATOR);
+        assertEquals(3, creators.length);
+        assertEquals("author2", creators[1]);
+        //managers: exactly two values, checked position by position
+        String[] managerNames = meta.getValues(OfficeOpenXMLExtended.MANAGER);
+        assertEquals(2, managerNames.length);
+        assertEquals("manager1", managerNames[0]);
+        assertEquals("manager2", managerNames[1]);
+    }
+
+    @Test
+    public void testOrigSourcePath() throws Exception {
+        List<Metadata> all = getRecursiveMetadata("test_recursive_embedded.docx", parseContext);
+        List<String> names = Arrays.asList(
+                all.get(11).getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip", names);
+        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", names);
+    }
+
+    @Test
+    @Ignore("TODO")
+    public void testBoldHyperlink() throws Exception {
+        //TIKA-1255: whitespace runs are collapsed to single spaces before comparing
+        String rawXml = getXML("testWORD_boldHyperlink.docx", parseContext).xml;
+        String normalized = rawXml.replaceAll("\\s+", " ");
+        assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", normalized);
+        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", normalized);
+    }
+
+    @Test
+    public void testLongForIntExceptionInSummaryDetails() throws Exception {
+        //TIKA-2055
+        assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx", parseContext).xml);
+    }
+
+    @Test
+    @Ignore("TODO")
+    public void testMacrosInDocm() throws Exception {
+        String contentKey = RecursiveParserWrapper.TIKA_CONTENT.getName();
+        Metadata macroStub = new Metadata(); //lower bound handed to assertContainsAtLeast
+        macroStub.add(contentKey, "Sub Embolden()");
+        macroStub.add(contentKey, "Sub Italicize()");
+        macroStub.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        macroStub.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+        assertContainsAtLeast(macroStub, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
deleted file mode 100644
index 06f0eed..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.junit.Test;
-
-
-public class SXWPFExtractorTest extends TikaTest {
-
-    @Test
-    public void basicTest() throws Exception {
-        ParseContext pc = new ParseContext();
-        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setUseSAXDocxExtractor(true);
-
-        pc.set(OfficeParserConfig.class, officeParserConfig);
-        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.docx", pc);
-
-        assertEquals(8, metadataList.size());
-        Metadata m = metadataList.get(0);
-
-        assertEquals("2016-11-29T00:58:00Z", m.get(TikaCoreProperties.CREATED));
-        assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED));
-        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
-        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
-        assertEquals("3", m.get(OfficeOpenXMLCore.REVISION));
-        assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER));
-        //assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
-        assertEquals("260", m.get(Office.WORD_COUNT));
-        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
-        assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
-        assertEquals("12", m.get(Office.LINE_COUNT));
-        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
-
-
-        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
-
-
-        assertContainsCountTimes("engaging title page", content, 1);
-        assertContainsCountTimes("This is the Author", content, 1);
-        assertContainsCountTimes("This is an engaging title page", content, 1);
-
-        assertContains("My Document Title", content);
-        assertContains("My Document Subtitle", content);
-
-        assertContains("<p>\tHeading1\t3</p>", content);
-
-
-        //TODO: integrate numbering
-        assertContains("Really basic 2.", content);
-
-        assertContainsCountTimes("This is a text box", content, 1);
-
-        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
-
-        assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content);
-
-        assertContains("<p>This is          10 spaces</p>", content);
-
-        //caption
-        assertContains("<p>Table 1: Table1 Caption</p>", content);
-
-        //embedded table
-        //TODO: figure out how to handle embedded tables in html
-        assertContains("<p>Embedded table r1c1</p>", content);
-
-        //shape
-        assertContainsCountTimes("<p>This is text within a shape", content, 1);
-
-        //sdt rich text
-        assertContains("<p>Rich text content control", content);
-
-        //sdt simple text
-        assertContains("<p>Simple text content control", content);
-
-        //sdt repeating
-        assertContains("Repeating content", content);
-
-        //sdt dropdown
-        //TODO: get options for dropdown
-        assertContains("Drop down1", content);
-
-        //sdt date
-        assertContains("<p>11/16/2016</p>", content);
-
-        //test that <tab/> works
-        assertContains("tab\ttab", content);
-
-        assertContainsCountTimes("serious word art", content, 1);
-        assertContainsCountTimes("Wordartr1c1", content, 1);
-
-        //glossary document contents
-        assertContains("Click or tap to enter a date", content);
-
-        //basic formatting
-        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
-                content);
-
-        //TODO: add chart parsing
-//        assertContains("This is the chart", content);
-
-        assertContains("This is a comment", content);
-
-        assertContains("This is an endnote", content);
-
-        assertContains("this is the footnote", content);
-
-        assertContains("First page header", content);
-
-        assertContains("Even page header", content);
-
-        assertContains("Odd page header", content);
-
-        assertContains("First page footer", content);
-
-        assertContains("Even page footer", content);
-
-        assertContains("Odd page footer", content);
-
-        //test default does not include deleted
-        assertNotContained("frog", content);
-
-        assertContains("Mattmann", content);
-
-        //TODO: extract chart text
-//        assertContains("This is the chart title", content);
-
-    }
-
-    @Test
-    public void testSkipDeleted() throws Exception {
-        ParseContext pc = new ParseContext();
-        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
-        officeParserConfig.setIncludeDeletedContent(true);
-        officeParserConfig.setUseSAXDocxExtractor(true);
-        officeParserConfig.setIncludeMoveFromContent(true);
-        pc.set(OfficeParserConfig.class, officeParserConfig);
-
-        XMLResult r = getXML("testWORD_2006ml.docx", pc);
-        assertContains("frog", r.xml);
-        assertContainsCount("Second paragraph", r.xml, 2);
-
-    }
-
-    private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
-        int i = haystack.indexOf("engaging title page");
-        int cnt = 0;
-        while (i > -1) {
-            cnt++;
-            i = haystack.indexOf("engaging title page", i+1);
-        }
-        assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount,
-                expectedCount, cnt);
-
-    }
-
-
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
new file mode 100644
index 0000000..cad9c5a
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+            <params>
+                <param name="useSAXDocxExtractor" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>


[7/7] tika git commit: update changes for TIKA-2191 and TIKA-2192

Posted by ta...@apache.org.
update changes for TIKA-2191 and TIKA-2192


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5425d02a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5425d02a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5425d02a

Branch: refs/heads/master
Commit: 5425d02a1ed97ce5f884a076f55ad8197cc6ac7b
Parents: 615bf75
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 09:06:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:06:27 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/5425d02a/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 8a97cd3..2dd9181 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Enabled extraction of embedded objects from headers, footers,
+    footnotes, endnotes and comments in legacy .docx parser (TIKA-2192).
+
   * Allow extraction of PDActions (including Javascript) from
     PDFs (TIKA-2090).
 
@@ -7,7 +10,7 @@ Release 1.15 - ??
     deleted text to align with .doc (TIKA-2187).
 
   * Added experimental SAX parser for .docx files. To select this parser,
-    set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
+    set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191).
 
   * Change default behavior to parse embedded documents even if the user
     forgets to specify a Parser.class in the ParseContext (TIKA-2096).


[2/7] tika git commit: TIKA-2191 -- step2 -- add handling for docm files...extract macros

Posted by ta...@apache.org.
TIKA-2191 -- step2 -- add handling for docm files...extract macros


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f93d4e1f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f93d4e1f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f93d4e1f

Branch: refs/heads/master
Commit: f93d4e1fffdb4a441f7fa750a43691adfa70c392
Parents: 8943013
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 11:14:34 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:00 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      | 15 +++++--
 .../microsoft/ooxml/SXWPFExtractorTest.java     | 42 +++++++++++---------
 2 files changed, 35 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index e08dab1..ee88f15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -72,7 +72,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         //handle main document
-        List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        List<PackagePart> pps = getMainDocumentParts();
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -81,7 +81,6 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
         //handle glossary document
         pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -145,7 +144,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                                     new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
                                             context.get(OfficeParserConfig.class)), hyperlinks))));
         } catch (TikaException e) {
-            e.printStackTrace();
+            //swallow
         }
 
     }
@@ -217,6 +216,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-        return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        //figure out which one this is
+        List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        if (pps.size() == 0) {
+            pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
+            if (pps.size() == 0) {
+                pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+            }
+        }
+        return pps;
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index fb7a977..dffa112 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -148,9 +148,6 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
                 content);
 
-        //TODO: add chart parsing
-//        assertContains("This is the chart", content);
-
         assertContains("This is a comment", content);
 
         assertContains("This is an endnote", content);
@@ -177,6 +174,9 @@ public class SXWPFExtractorTest extends TikaTest {
         //TODO: extract chart text
 //        assertContains("This is the chart title", content);
 
+        //TODO: add chart parsing
+//        assertContains("This is the chart", content);
+
     }
 
     /**
@@ -261,15 +261,18 @@ public class SXWPFExtractorTest extends TikaTest {
 
         // Text too
         assertTrue(xml.contains("<p>The end!</p>"));
+    }
 
+    @Test
+    public void testContiguousHTMLFormatting() throws Exception {
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
-        xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+        String xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
 
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
 
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
@@ -278,7 +281,7 @@ public class SXWPFExtractorTest extends TikaTest {
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
         // handled this as 3 character runs
-//TODO:        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
     /**
@@ -311,9 +314,9 @@ public class SXWPFExtractorTest extends TikaTest {
 
     @Test
     public void testVarious() throws Exception {
-        XMLResult xmlResult = getXML("testWORD_various.docx", parseContext);
-        String content = xmlResult.xml;
-        Metadata metadata = xmlResult.metadata;
+        Metadata metadata = new Metadata();
+        String content = getText(getResourceAsStream("/test-documents/testWORD_various.docx"),
+                new AutoDetectParser(), parseContext, metadata);
         //content = content.replaceAll("\\s+"," ");
         assertContains("Footnote appears here", content);
         assertContains("This is a footnote.", content);
@@ -328,8 +331,8 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
-//TODO:        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
-//TODO:        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);
         for (int row = 1; row <= 3; row++) {
@@ -522,7 +525,7 @@ public class SXWPFExtractorTest extends TikaTest {
         String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
         int a = xml.indexOf("This file contains a thumbnail");
         int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
-
+        System.out.println(xml);
         assertTrue(a != -1);
         assertTrue(b != -1);
         assertTrue(a < b);
@@ -566,11 +569,11 @@ public class SXWPFExtractorTest extends TikaTest {
     }
 
     @Test
+    @Ignore("TODO -- paragraph list numbers")
     public void testDOCXParagraphNumbering() throws Exception {
         String xml = getXML("testWORD_numbered_list.docx", parseContext).xml;
-        //SAX parser is getting this.  DOM parser is not
+        //SAX parser is getting this.  DOM parser is not!
         assertContains("add a list here", xml);
-/*TODO:
         assertContains("1) This", xml);
         assertContains("a) Is", xml);
         assertContains("i) A multi", xml);
@@ -591,11 +594,11 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
         assertContains("1.1.1. 1.1.1", xml);
         assertContains("1.1. 1.2-&gt;1.1  //set the value", xml);
-*/
+
     }
 
     @Test
-    @Ignore("TODO")
+    @Ignore("TODO -- paragraph list numbers")
     public void testDOCXOverrideParagraphNumbering() throws Exception {
         String xml = getXML("testWORD_override_list_numbering.docx").xml;
 
@@ -678,8 +681,11 @@ public class SXWPFExtractorTest extends TikaTest {
     }
 
     @Test
-    @Ignore("TODO")
     public void testMacrosInDocm() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
+        //check that content came out of the .docm file
+        assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
         Metadata minExpected = new Metadata();
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
         minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
@@ -687,7 +693,7 @@ public class SXWPFExtractorTest extends TikaTest {
         minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
 
-        assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+        assertContainsAtLeast(minExpected, metadataList);//, parseContext));
     }
 
 


[5/7] tika git commit: TIKA-2191 -- step 5 actually extract images embedded in areas besides the body of docx/m

Posted by ta...@apache.org.
TIKA-2191 -- step 5 actually extract images embedded in areas besides the body of docx/m


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4469ca2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4469ca2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4469ca2c

Branch: refs/heads/master
Commit: 4469ca2c4ea725e9f5d94c116aaf248deea2a6eb
Parents: 806eaf8
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 08:43:59 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:48 2016 -0500

----------------------------------------------------------------------
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |  11 +++-
 .../ooxml/SXWPFWordExtractorDecorator.java      |  47 ++++++++++++++--
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  |  32 ++++++++---
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  54 ++++++++++++-------
 .../microsoft/ooxml/SXWPFExtractorTest.java     |  35 ++++++++++--
 .../test-documents/testWORD_embedded_pics.docx  | Bin 0 -> 52399 bytes
 6 files changed, 142 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index f9ba8a6..6bc867d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -22,7 +22,9 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
@@ -158,10 +160,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
     private void handleEmbeddedParts(ContentHandler handler)
             throws TikaException, IOException, SAXException {
+        Set<String> seen = new HashSet<>();
         try {
             for (PackagePart source : getMainDocumentParts()) {
                 for (PackageRelationship rel : source.getRelationships()) {
-
+                    URI targetURI = rel.getTargetURI();
+                    if (targetURI != null) {
+                        if (seen.contains(targetURI.toString())) {
+                            continue;
+                        }
+                        seen.add(targetURI.toString());
+                    }
                     URI sourceURI = rel.getSourceURI();
                     String sourceDesc;
                     if (sourceURI != null) {

http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 8634cd6..43fca3b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -59,13 +60,21 @@ import org.xml.sax.SAXException;
 public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
     private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
-            RELATION_OLE_OBJECT,
             RELATION_AUDIO,
             RELATION_IMAGE,
             RELATION_PACKAGE,
             RELATION_OFFICE_DOCUMENT
     };
 
+    //include all parts that might have embedded objects
+    private final static String[] MAIN_PART_RELATIONS = new String[]{
+            XWPFRelation.HEADER.getRelation(),
+            XWPFRelation.FOOTER.getRelation(),
+            XWPFRelation.FOOTNOTE.getRelation(),
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+    };
+
     private final OPCPackage opcPackage;
     private final ParseContext context;
 
@@ -82,7 +91,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         //handle main document
-        List<PackagePart> pps = getMainDocumentParts();
+        List<PackagePart> pps = getStoryDocumentParts();
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -176,6 +185,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             }
 
             for (String rel : EMBEDDED_RELATIONSHIPS) {
+
                 prc = bodyPart.getRelationshipsByType(rel);
                 for (int i = 0; i < prc.size(); i++) {
                     PackageRelationship pr = prc.getRelationship(i);
@@ -247,11 +257,40 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     /**
-     * This returns the main document only.
+     * This returns all items that might contain embedded objects:
+     * main document, headers, footers, comments, etc.
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-        //figure out which one this is
+
+        List<PackagePart> mainStoryDocs = getStoryDocumentParts();
+        List<PackagePart> relatedParts = new ArrayList<>();
+
+        for (PackagePart pp : mainStoryDocs) {
+            addRelatedParts(pp, relatedParts);
+        }
+        relatedParts.addAll(mainStoryDocs);
+        return relatedParts;
+    }
+
+    private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+        for (String relation : MAIN_PART_RELATIONS) {
+            PackageRelationshipCollection prc = null;
+            try {
+                prc = documentPart.getRelationshipsByType(relation);
+                if (prc != null) {
+                    for (int i = 0; i < prc.size(); i++) {
+                        PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+                        relatedParts.add(packagePart);
+                    }
+                }
+            } catch (InvalidFormatException e) {
+            }
+        }
+
+    }
+
+    private List<PackagePart> getStoryDocumentParts() {
         List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
         if (pps.size() == 0) {
             pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());

http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 2538215..b2e74d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -34,6 +34,7 @@ import org.xml.sax.helpers.DefaultHandler;
 public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
 
+
     enum EditType {
         NONE,
         INSERT,
@@ -48,6 +49,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private final static String O_NS = "urn:schemas-microsoft-com:office:office";
     private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
     private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+    private static final String V_NS = "urn:schemas-microsoft-com:vml";
 
     private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
 
@@ -71,6 +73,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private boolean inDelText = false;
 
     private boolean inPic = false;
+    private boolean inPict = false;
     private String picDescription = null;
     private String picRId = null;
     private String picFilename = null;
@@ -154,6 +157,13 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
             }
         }
 
+        if (uri == null || uri.equals(V_NS)) {
+            if ("imagedata".equals(localName)) {
+                picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+                picDescription = atts.getValue(O_NS, "title");
+            }
+        }
+
         if (uri == null || uri.equals(W_NS)) {
             if (localName.equals("p")) {
                 bodyContentsHandler.startParagraph();
@@ -248,14 +258,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
         if (PIC_NS.equals(uri)) {
             if ("pic".equals(localName)) {
-                String picFileName = null;
-                if (picRId != null) {
-                    picFileName = linkedRelationships.get(picRId);
-                }
-                bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
-                picDescription = null;
-                picRId = null;
+                handlePict();
                 inPic = false;
+                return;
             }
 
         }
@@ -291,10 +296,23 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 editType = EditType.NONE;
             } else if (localName.equals("hyperlink")) {
                 bodyContentsHandler.hyperlinkEnd();
+            } else if ("pict".equals(localName)) {
+                handlePict();
             }
         }
     }
 
+    private void handlePict() {
+        String picFileName = null;
+        if (picRId != null) {
+            picFileName = linkedRelationships.get(picRId);
+        }
+        bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+        picDescription = null;
+        picRId = null;
+        inPic = false;
+    }
+
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
 

http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0059d09..a831006 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,30 +468,44 @@ public class OOXMLParserTest extends TikaTest {
      * Test that we can extract image from docx header
      */
     @Test
+    @Ignore("fix actual extraction")
     public void testWordPicturesInHeader() throws Exception {
-        Metadata metadata = new Metadata();
-        ParseContext context = new ParseContext();
-
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
+        List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
+        assertEquals(2, metadataList.size());
+        Metadata m = metadataList.get(0);
+        String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                m.get(Metadata.CONTENT_TYPE));
+        // Check that custom headings came through
+        assertTrue(mainContent.contains("<img"));
+    }
 
-        // Try with a document containing various tables and formattings
-        try (InputStream input = getTestDocument("headerPic.docx")) {
-            parser.parse(input, handler, metadata, context);
-            String xml = sw.toString();
-            assertEquals(
-                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            // Check that custom headings came through
-            assertTrue(xml.contains("<img"));
+    @Test
+    @Ignore("not currently extracting from non-body components")
+    public void testPicturesInVariousPlaces() throws Exception {
+        //test that images are actually extracted from
+        //headers, footers, comments, endnotes, footnotes
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx");
+
+        //only process embedded resources once
+        assertEquals(3, metadataList.size());
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        for (int i = 1; i < 4; i++) {
+            assertContains("header"+i+"_pic", content);
+            assertContains("footer"+i+"_pic", content);
         }
+        assertContains("body_pic.jpg", content);
+        assertContains("sdt_pic.jpg", content);
+        assertContains("deeply_embedded_pic", content);
+        assertContains("deleted_pic", content);//TODO: don't extract this
+        assertContains("footnotes_pic", content);
+        assertContains("comments_pic", content);
+        assertContains("endnotes_pic", content);
+//        assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+        assertContainsCount("<img src=", content, 14);
     }
-
     /**
      * Documents with some sheets are protected, but not all.
      * See TIKA-364.

http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index f4a1aeb..6064be2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -290,16 +290,41 @@ public class SXWPFExtractorTest extends TikaTest {
      * Test that we can extract image from docx header
      */
     @Test
-    @Ignore("TODO")
     public void testWordPicturesInHeader() throws Exception {
-        assertEquals(2, getRecursiveMetadata("headerPic.docx").size());
-        XMLResult xmlResult = getXML("headerPic.docx",  parseContext);
+        List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx", parseContext);
+        assertEquals(2, metadataList.size());
+        Metadata m = metadataList.get(0);
+        String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
             assertEquals(
                     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                    xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+                    m.get(Metadata.CONTENT_TYPE));
             // Check that custom headings came through
-            assertTrue(xmlResult.xml.contains("<img"));
+            assertTrue(mainContent.contains("<img"));
+    }
 
+    @Test
+    public void testPicturesInVariousPlaces() throws Exception {
+        //test that images are actually extracted from
+        //headers, footers, comments, endnotes, footnotes
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx", parseContext);
+
+        //only process embedded resources once
+        assertEquals(3, metadataList.size());
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        for (int i = 1; i < 4; i++) {
+            assertContains("header"+i+"_pic", content);
+            assertContains("footer"+i+"_pic", content);
+        }
+        assertContains("body_pic.jpg", content);
+        assertContains("sdt_pic.jpg", content);
+        assertContains("deeply_embedded_pic", content);
+        assertContains("deleted_pic", content);//TODO: don't extract this
+        assertContains("footnotes_pic", content);
+        assertContains("comments_pic", content);
+        assertContains("endnotes_pic", content);
+//        assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+        assertContainsCount("<img src=", content, 14);
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
new file mode 100644
index 0000000..1a63e6f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx differ


[3/7] tika git commit: TIKA-2191 -- step 3 -- clean up <b> and <i> tag handling

Posted by ta...@apache.org.
TIKA-2191 -- step 3 -- clean up <b> and <i> tag handling


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1aca10a2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1aca10a2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1aca10a2

Branch: refs/heads/master
Commit: 1aca10a26dada02a045a1bc9eb7c3cfc1b36a83e
Parents: f93d4e1
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 12:17:56 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:16 2016 -0500

----------------------------------------------------------------------
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  | 19 ++----
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java |  9 ++-
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     | 71 +++++++++++++++-----
 .../microsoft/ooxml/SXWPFExtractorTest.java     |  4 +-
 4 files changed, 70 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index dce36a2..9e5ce6b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -65,7 +65,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private boolean inRPr = false;
     private boolean inNumPr = false;
     private boolean inDelText = false;
-    private boolean inHyperlink = false;
 
     //alternate content can be embedded in itself.
     //need to track depth.
@@ -73,7 +72,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private int inACChoiceDepth = 0;
     private int inACFallbackDepth = 0;
     private EditType editType = EditType.NONE;
-    private String hyperlink = null;
 
     private XWPFRunProperties currRunProperties = new XWPFRunProperties();
 
@@ -151,10 +149,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 startEditedSection(editType.MOVE_FROM, atts);
             } else if (localName.equals("hyperlink")) {
                 String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+                String hyperlink = null;
                 if (hyperlinkId != null) {
                     hyperlink = hyperlinks.get(hyperlinkId);
                 }
-                inHyperlink = true;
+                bodyContentsHandler.hyperlinkStart(hyperlink);
             } else if (localName.equals("footnoteReference")) {
                 String id = atts.getValue(W_NS, "id");
                 bodyContentsHandler.footnoteReference(id);
@@ -210,7 +209,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
             }
 
 
-            if (localName.equals("r") && !inHyperlink) {
+            if (localName.equals("r")) {
                 bodyContentsHandler.run(currRunProperties, runBuffer.toString());
                 inR = false;
                 runBuffer.setLength(0);
@@ -235,13 +234,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                     localName.equals("moveTo") || localName.equals("moveFrom")) {
                 editType = EditType.NONE;
             } else if (localName.equals("hyperlink")) {
-                if (hyperlink != null) {
-                    bodyContentsHandler.hyperlinkRun(hyperlink, runBuffer.toString());
-                } else {
-                    bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-                }
-                runBuffer.setLength(0);
-                inHyperlink = false;
+                bodyContentsHandler.hyperlinkEnd();
             }
         }
     }
@@ -281,7 +274,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
         void run(XWPFRunProperties runProperties, String contents);
 
-        void hyperlinkRun(String link, String text);
+        void hyperlinkStart(String link);
+
+        void hyperlinkEnd();
 
         void startParagraph();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 06ef951..4ee7a4f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -265,8 +265,13 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void hyperlinkRun(String link, String text) {
-            buffer.append(" (").append(text).append(") ");
+        public void hyperlinkStart(String link) {
+            //no-op
+        }
+
+        @Override
+        public void hyperlinkEnd() {
+            //no-op
         }
 
         @Override

http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index 2f27739..d62e270 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -38,6 +38,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     private int pDepth = 0; //paragraph depth
     private boolean isItalics = false;
     private boolean isBold = false;
+    private boolean wroteHyperlinkStart = false;
 
     public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFListManager listManager, OfficeParserConfig parserConfig) {
         this.xhtml = xhtml;
@@ -48,37 +49,57 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
 
     @Override
     public void run(XWPFRunProperties runProperties, String contents) {
-        //TODO: smooth out bold/italics to handle only changes
-        //If two runs are bold, only add <b> at beginning and end of the run pair
         try {
-            if (runProperties.getBold()) {
-                xhtml.startElement("b");
+            // True if we are currently in the named style tag:
+            if (runProperties.getBold() != isBold) {
+                if (isItalics) {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
+                if (runProperties.getBold()) {
+                    xhtml.startElement("b");
+                    isBold = true;
+                } else {
+                    xhtml.endElement("b");
+                    isBold = false;
+                }
             }
-            if (runProperties.getItalics()) {
-                xhtml.startElement("i");
+
+            if (runProperties.getItalics() != isItalics) {
+                if (runProperties.getItalics()) {
+                    xhtml.startElement("i");
+                    isItalics = true;
+                } else {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
             }
 
             xhtml.characters(contents);
-            if (runProperties.getItalics()) {
-                xhtml.endElement("i");
-            }
-            if (runProperties.getBold()) {
-                xhtml.endElement("b");
-            }
+
         } catch (SAXException e) {
 
         }
     }
 
     @Override
-    public void hyperlinkRun(String link, String text) {
-        //System.out.println("tika handler: "+link + " :: "+text);
+    public void hyperlinkStart(String link) {
         try {
             if (link != null) {
                 xhtml.startElement("a", "href", link);
+                wroteHyperlinkStart = true;
             }
-            xhtml.characters(text);
-            if (link != null) {
+        } catch (SAXException e) {
+
+        }
+    }
+
+    @Override
+    public void hyperlinkEnd() {
+        try {
+            if (wroteHyperlinkStart) {
+                closeStyleTags();
+                wroteHyperlinkStart = false;
                 xhtml.endElement("a");
             }
         } catch (SAXException e) {
@@ -101,6 +122,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     @Override
     public void endParagraph() {
         try {
+            closeStyleTags();
             if (pDepth == 1) {
                 xhtml.endElement("p");
             } else {
@@ -168,7 +190,11 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
 
     @Override
     public void startSDT() {
-        //no-op
+        try {
+            closeStyleTags();
+        } catch (SAXException e) {
+
+        }
     }
 
     @Override
@@ -221,4 +247,15 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     public boolean getIncludeMoveFromText() {
         return includeMoveFromText;
     }
+
+    private void closeStyleTags() throws SAXException {
+        if (isItalics) {
+            xhtml.endElement("i");
+            isItalics = false;
+        }
+        if (isBold) {
+            xhtml.endElement("b");
+            isBold = false;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index dffa112..22e5644 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -144,7 +144,7 @@ public class SXWPFExtractorTest extends TikaTest {
         //glossary document contents
         assertContains("Click or tap to enter a date", content);
 
-        //basic formatting
+        //basic b/i tags...make sure not to overlap!
         assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
                 content);
 
@@ -665,10 +665,10 @@ public class SXWPFExtractorTest extends TikaTest {
     }
 
     @Test
-    @Ignore("TODO")
     public void testBoldHyperlink() throws Exception {
         //TIKA-1255
         String xml = getXML("testWORD_boldHyperlink.docx", parseContext).xml;
+        System.out.println(xml);
         xml = xml.replaceAll("\\s+", " ");
         assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
         assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml);


[4/7] tika git commit: TIKA-2191 -- step 4-- add markup for embedded pics

Posted by ta...@apache.org.
TIKA-2191 -- step 4-- add markup for embedded pics


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/806eaf8b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/806eaf8b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/806eaf8b

Branch: refs/heads/master
Commit: 806eaf8b1802a3a3071a5ae0bdc35c20d6739280
Parents: 1aca10a
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 13:28:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:34 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      | 47 ++++++++++++--
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  | 66 +++++++++++++++++++-
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 10 +++
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     | 38 +++++++++++
 .../microsoft/ooxml/SXWPFExtractorTest.java     | 44 ++++++++++++-
 5 files changed, 193 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index ee88f15..8634cd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
@@ -29,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
@@ -56,10 +58,18 @@ import org.xml.sax.SAXException;
  */
 public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
+    private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
+            RELATION_OLE_OBJECT,
+            RELATION_AUDIO,
+            RELATION_IMAGE,
+            RELATION_PACKAGE,
+            RELATION_OFFICE_DOCUMENT
+    };
 
     private final OPCPackage opcPackage;
     private final ParseContext context;
 
+
     public SXWPFWordExtractorDecorator(ParseContext context,
                                        XWPFEventBasedWordExtractor extractor) {
         super(context, extractor);
@@ -135,22 +145,22 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     private void handlePart(PackagePart packagePart,
                             XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
 
-        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
         try (InputStream stream = packagePart.getInputStream()) {
             context.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
                             new XWPFDocumentXMLBodyHandler(
                                     new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
-                                            context.get(OfficeParserConfig.class)), hyperlinks))));
+                                            context.get(OfficeParserConfig.class)), linkedRelationships))));
         } catch (TikaException e) {
             //swallow
         }
 
     }
 
-    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
-        Map<String, String> hyperlinks = new HashMap<>();
+    private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
+        Map<String, String> linkedRelationships = new HashMap<>();
         try {
             PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
             for (int i = 0; i < prc.size(); i++) {
@@ -161,12 +171,37 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                 String id = pr.getId();
                 String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
                 if (id != null && url != null) {
-                    hyperlinks.put(id, url);
+                    linkedRelationships.put(id, url);
+                }
+            }
+
+            for (String rel : EMBEDDED_RELATIONSHIPS) {
+                prc = bodyPart.getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackageRelationship pr = prc.getRelationship(i);
+                    if (pr == null) {
+                        continue;
+                    }
+                    String id = pr.getId();
+                    String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                    String fileName = uriString;
+                    if (pr.getTargetURI() != null) {
+                        try {
+                            fileName = FileHelper.getFilename(new File(fileName));
+                        } catch (Exception e) {
+                            fileName = uriString;
+                        }
+                    }
+                    if (id != null) {
+                        fileName = (fileName == null) ? "" : fileName;
+                        linkedRelationships.put(id, fileName);
+                    }
                 }
             }
+
         } catch (InvalidFormatException e) {
         }
-        return hyperlinks;
+        return linkedRelationships;
     }
 /*
     private XWPFStyles loadStyles(PackagePart packagePart) {

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 9e5ce6b..2538215 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -45,6 +45,10 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
     private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+    private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+    private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+
     private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
 
     private final static char[] TAB = new char[1];
@@ -55,7 +59,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
     private final XWPFBodyContentsHandler bodyContentsHandler;
     //private final RelationshipsManager relationshipsManager;
-    private final Map<String, String> hyperlinks;
+    private final Map<String, String> linkedRelationships;
 
     private final StringBuilder runBuffer = new StringBuilder();
 
@@ -66,6 +70,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     private boolean inNumPr = false;
     private boolean inDelText = false;
 
+    private boolean inPic = false;
+    private String picDescription = null;
+    private String picRId = null;
+    private String picFilename = null;
+
     //alternate content can be embedded in itself.
     //need to track depth.
     //if in alternate, choose fallback, maybe make this configurable?
@@ -78,7 +87,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
                                       Map<String, String> hyperlinks) {
         this.bodyContentsHandler = bodyContentsHandler;
-        this.hyperlinks = hyperlinks;
+        this.linkedRelationships = hyperlinks;
     }
 
 
@@ -111,6 +120,39 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
         if (inACChoiceDepth > 0) {
             return;
         }
+        if (uri == null || uri.equals(O_NS)) {
+            if (localName.equals("OLEObject")) {
+                String type = null;
+                String refId = null;
+                //TODO: want to get ProgID?
+                for (int i = 0; i < atts.getLength(); i++) {
+                    String attLocalName = atts.getLocalName(i);
+                    String attValue = atts.getValue(i);
+                    if (attLocalName.equals("Type")) {
+                        type = attValue;
+                    } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+                        refId = attValue;
+                    }
+                }
+                if ("Embed".equals(type)) {
+                    bodyContentsHandler.embeddedOLERef(refId);
+                }
+            }
+        }
+
+        if (uri == null || uri.equals(PIC_NS)) {
+            if ("pic".equals(localName)) {
+                inPic = true;
+            } else if ("cNvPr".equals(localName)) {
+                picDescription = atts.getValue("", "descr");
+            }
+        }
+
+        if (uri == null || uri.equals(DRAWING_MAIN_NS)) {
+            if ("blip".equals(localName)) {
+                picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+            }
+        }
 
         if (uri == null || uri.equals(W_NS)) {
             if (localName.equals("p")) {
@@ -151,7 +193,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
                 String hyperlink = null;
                 if (hyperlinkId != null) {
-                    hyperlink = hyperlinks.get(hyperlinkId);
+                    hyperlink = linkedRelationships.get(hyperlinkId);
                 }
                 bodyContentsHandler.hyperlinkStart(hyperlink);
             } else if (localName.equals("footnoteReference")) {
@@ -203,6 +245,20 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 inACFallbackDepth--;
             }
         }
+
+        if (PIC_NS.equals(uri)) {
+            if ("pic".equals(localName)) {
+                String picFileName = null;
+                if (picRId != null) {
+                    picFileName = linkedRelationships.get(picRId);
+                }
+                bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+                picDescription = null;
+                picRId = null;
+                inPic = false;
+            }
+
+        }
         if (uri == null || uri.equals(W_NS)) {
             if (inACChoiceDepth > 0) {
                 return;
@@ -309,5 +365,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
         void endnoteReference(String id);
 
         boolean getIncludeMoveFromText();
+
+        void embeddedOLERef(String refId);
+
+        void embeddedPicRef(String picFileName, String picDescription);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 4ee7a4f..ee6bb85 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -353,6 +353,16 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         public boolean getIncludeMoveFromText() {
             return false;
         }
+
+        @Override
+        public void embeddedOLERef(String refId) {
+            //no-op
+        }
+
+        @Override
+        public void embeddedPicRef(String picFileName, String picDescription) {
+            //no-op
+        }
     }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index d62e270..cd28583 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
 
@@ -248,6 +249,43 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
         return includeMoveFromText;
     }
 
+    @Override
+    public void embeddedOLERef(String relId) {
+        if (relId == null) {
+            return;
+        }
+        try {
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", relId);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+
+        } catch (SAXException e) {
+
+        }
+    }
+
+    @Override
+    public void embeddedPicRef(String picFileName, String picDescription) {
+
+        try {
+            AttributesImpl attr = new AttributesImpl();
+            if (picFileName != null) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+            }
+            if (picDescription != null) {
+                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+            }
+
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+
+        } catch (SAXException e) {
+
+        }
+    }
+
     private void closeStyleTags() throws SAXException {
         if (isItalics) {
             xhtml.endElement("i");

http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 22e5644..f4a1aeb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.util.Arrays;
@@ -38,6 +39,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
@@ -418,13 +420,13 @@ public class SXWPFExtractorTest extends TikaTest {
 
     // TIKA-989:
     @Test
-    @Ignore("TODO")
     public void testEmbeddedPDF() throws Exception {
         String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+        System.out.println(xml);
         int i = xml.indexOf("Here is the pdf file:");
-        int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\" />");
         int k = xml.indexOf("Bye Bye");
-        int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+        int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\" />");
         int m = xml.indexOf("Bye for real.");
         assertTrue(i != -1);
         assertTrue(j != -1);
@@ -696,5 +698,41 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContainsAtLeast(minExpected, metadataList);//, parseContext));
     }
 
+    @Test
+    public void testEmbedded() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_embeded.docx", parseContext);
+        Metadata main = metadataList.get(0);
+        String content = main.get(RecursiveParserWrapper.TIKA_CONTENT);
+        //make sure markup is there
+        assertContains("<img src=\"embedded:image2.jpeg\" alt=\"A description...\" />",
+                content);
+
+        assertContains("<div class=\"embedded\" id=\"rId8\" />",
+                content);
+
+        assertEquals(16, metadataList.size());
+    }
+
+    @Test
+    public void iterate() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, EmptyParser.INSTANCE);
+        for (File f : getResourceAsFile("/test-documents").listFiles()) {
+            if (! f.getName().equals("testWORD_embeded.docx")) {
+                continue;
+            }
+            if (f.getName().endsWith("docx") || f.getName().endsWith(".docm")) {
+                try {
+                    XMLResult r = getXML(f.getName(), context);
+                    if (r.xml.contains("<img")) {
+                        System.out.println(f.getName());
+                    }
+                    System.out.println(r.xml);
+                } catch (Exception e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
 
 }


[6/7] tika git commit: TIKA-2192 - add extraction of embedded objects in DOM docx parser from more than just main document

Posted by ta...@apache.org.
TIKA-2192 - add extraction of embedded objects in DOM docx parser from more than just main document


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/615bf75f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/615bf75f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/615bf75f

Branch: refs/heads/master
Commit: 615bf75fc11e8fc299be550b8cd4bb24f45a264a
Parents: 4469ca2
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 09:04:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:04:51 2016 -0500

----------------------------------------------------------------------
 .../ooxml/XWPFWordExtractorDecorator.java       | 35 ++++++++++++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  3 +-
 2 files changed, 34 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index ccbf45e..a9eb93f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -21,7 +21,9 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@@ -38,6 +40,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFPicture;
 import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFRun;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
 import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
@@ -66,6 +69,16 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     private static final String LIST_DELIMITER = " ";
 
 
+    //include all parts that might have embedded objects
+    private final static String[] MAIN_PART_RELATIONS = new String[]{
+            XWPFRelation.HEADER.getRelation(),
+            XWPFRelation.FOOTER.getRelation(),
+            XWPFRelation.FOOTNOTE.getRelation(),
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+    };
+
+
     private XWPFDocument document;
     private XWPFStyles styles;
 
@@ -438,16 +451,34 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     /**
-     * Word documents are simple, they only have the one
-     * main part
+     * Include main body and anything else that can
+     * have an attachment/embedded object
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
         List<PackagePart> parts = new ArrayList<PackagePart>();
         parts.add(document.getPackagePart());
+        addRelatedParts(document.getPackagePart(), parts);
         return parts;
     }
 
+    private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+        for (String relation : MAIN_PART_RELATIONS) {
+            PackageRelationshipCollection prc = null;
+            try {
+                prc = documentPart.getRelationshipsByType(relation);
+                if (prc != null) {
+                    for (int i = 0; i < prc.size(); i++) {
+                        PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+                        relatedParts.add(packagePart);
+                    }
+                }
+            } catch (InvalidFormatException e) {
+            }
+        }
+
+    }
+
     private class TmpFormatting {
         private boolean bold = false;
         private boolean italic = false;

http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a831006..e84f6d0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,7 +468,6 @@ public class OOXMLParserTest extends TikaTest {
      * Test that we can extract image from docx header
      */
     @Test
-    @Ignore("fix actual extraction")
     public void testWordPicturesInHeader() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
         assertEquals(2, metadataList.size());
@@ -482,7 +481,7 @@ public class OOXMLParserTest extends TikaTest {
     }
 
     @Test
-    @Ignore("not currently extracting from non-body components")
+    @Ignore("need to add links in xhtml")
     public void testPicturesInVariousPlaces() throws Exception {
         //test that images are actually extracted from
         //headers, footers, comments, endnotes, footnotes