You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:44 UTC
[1/7] tika git commit: TIKA-2191 -- step1 -- add other docx tests and
comment/ignore where appropriate
Repository: tika
Updated Branches:
refs/heads/master 99b592437 -> 5425d02a1
TIKA-2191 -- step1 -- add other docx tests and comment/ignore where appropriate
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/89430130
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/89430130
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/89430130
Branch: refs/heads/master
Commit: 894301307da5167c95585688f9448d3050f53aaa
Parents: 99b5924
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 10:10:37 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:00:37 2016 -0500
----------------------------------------------------------------------
.../parser/microsoft/ooxml/OOXMLParserTest.java | 16 +
.../microsoft/ooxml/SXWPFExtractorTest.java | 694 +++++++++++++++++++
.../ooxml/xwpf/SXWPFExtractorTest.java | 187 -----
.../parser/microsoft/tika-config-sax-docx.xml | 27 +
4 files changed, 737 insertions(+), 187 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bfbd8ce..0059d09 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import javax.xml.transform.OutputKeys;
@@ -37,6 +38,7 @@ import java.util.Locale;
import java.util.Map;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -1330,6 +1332,20 @@ public class OOXMLParserTest extends TikaTest {
System.out.println("elapsed: "+(new Date().getTime()-started) + " with " + ex + " exceptions");
}
+ /**
+ * Loads tika-config-sax-docx.xml, builds an AutoDetectParser from it and parses a docx.
+ * The NOTE below explains why finding the title text shows that the DOM extractor
+ * was not the one used.
+ */
+ @Test
+ public void testInitializationViaConfig() throws Exception {
+ //NOTE: this test relies on a bug in the DOM extractor that
+ //is passing over the title information.
+ //once we fix that, this test will no longer be meaningful!
+ TikaConfig tikaConfig;
+ //the stream is only needed while the config is being built; make sure it gets closed
+ try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml")) {
+ assertNotNull(is);
+ tikaConfig = new TikaConfig(is);
+ }
+ AutoDetectParser p = new AutoDetectParser(tikaConfig);
+ XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata());
+ assertContains("engaging title", xml.xml);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
new file mode 100644
index 0000000..fb7a977
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -0,0 +1,694 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+
+public class SXWPFExtractorTest extends TikaTest {
+
+ private ParseContext parseContext;
+
+ /**
+ * Creates a fresh ParseContext before each test and registers an
+ * OfficeParserConfig that has the SAX docx extractor turned on.
+ */
+ @Before
+ public void setUp() {
+ OfficeParserConfig saxConfig = new OfficeParserConfig();
+ saxConfig.setUseSAXDocxExtractor(true);
+
+ parseContext = new ParseContext();
+ parseContext.set(OfficeParserConfig.class, saxConfig);
+ }
+
+ /**
+ * Parses testWORD_2006ml.docx recursively with the shared SAX parse context and
+ * checks the metadata and extracted content of the first entry in the result list.
+ */
+ @Test
+ public void basicTest() throws Exception {
+
+ List<Metadata> allMetadata = getRecursiveMetadata("testWORD_2006ml.docx", parseContext);
+ assertEquals(8, allMetadata.size());
+
+ //every check below is made against the first metadata entry
+ Metadata main = allMetadata.get(0);
+
+ //core and extended document properties
+ assertEquals("2016-11-29T00:58:00Z", main.get(TikaCoreProperties.CREATED));
+ assertEquals("2016-11-29T17:54:00Z", main.get(TikaCoreProperties.MODIFIED));
+ assertEquals("My Document Title", main.get(TikaCoreProperties.TITLE));
+ assertEquals("This is the Author", main.get(TikaCoreProperties.CREATOR));
+ assertEquals("3", main.get(OfficeOpenXMLCore.REVISION));
+ assertEquals("Allison, Timothy B.", main.get(TikaCoreProperties.MODIFIER));
+ //assertEquals("0", main.get(OfficeOpenXMLExtended.DOC_SECURITY));
+ assertEquals("260", main.get(Office.WORD_COUNT));
+ assertEquals("3", main.get(Office.PARAGRAPH_COUNT));
+ assertEquals("1742", main.get(Office.CHARACTER_COUNT_WITH_SPACES));
+ assertEquals("12", main.get(Office.LINE_COUNT));
+ assertEquals("16.0000", main.get(OfficeOpenXMLExtended.APP_VERSION));
+
+ String content = main.get(RecursiveParserWrapper.TIKA_CONTENT);
+
+ //title page text must show up exactly once
+ assertContainsCount("engaging title page", content, 1);
+ //need \n to differentiate from metadata values
+ assertContainsCount("This is the Author\n", content, 1);
+ assertContainsCount("This is an engaging title page", content, 1);
+
+ assertContains("My Document Title", content);
+ assertContains("My Document Subtitle", content);
+
+ assertContains("<p>\tHeading1\t3</p>", content);
+
+ //TODO: integrate numbering
+ assertContains("Really basic 2.", content);
+
+ assertContainsCount("This is a text box", content, 1);
+
+ //hyperlinks, external and to a local file
+ assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
+ assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content);
+
+ assertContains("<p>This is 10 spaces</p>", content);
+
+ //caption
+ assertContains("<p>Table 1: Table1 Caption</p>", content);
+
+ //embedded table
+ //TODO: figure out how to handle embedded tables in html
+ assertContains("<p>Embedded table r1c1</p>", content);
+
+ //shape
+ assertContainsCount("<p>This is text within a shape", content, 1);
+
+ //sdt rich text
+ assertContains("<p>Rich text content control", content);
+
+ //sdt simple text
+ assertContains("<p>Simple text content control", content);
+
+ //sdt repeating
+ assertContains("Repeating content", content);
+
+ //sdt dropdown
+ //TODO: get options for dropdown
+ assertContains("Drop down1", content);
+
+ //sdt date
+ assertContains("<p>11/16/2016</p>", content);
+
+ //test that <tab/> works
+ assertContains("tab\ttab", content);
+
+ assertContainsCount("serious word art", content, 1);
+ assertContainsCount("Wordartr1c1", content, 1);
+
+ //glossary document contents
+ assertContains("Click or tap to enter a date", content);
+
+ //basic formatting
+ assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
+ content);
+
+ //TODO: add chart parsing
+// assertContains("This is the chart", content);
+
+ //comments, endnotes and footnotes
+ assertContains("This is a comment", content);
+ assertContains("This is an endnote", content);
+ assertContains("this is the footnote", content);
+
+ //headers
+ assertContains("First page header", content);
+ assertContains("Even page header", content);
+ assertContains("Odd page header", content);
+
+ //footers
+ assertContains("First page footer", content);
+ assertContains("Even page footer", content);
+ assertContains("Odd page footer", content);
+
+ //test default does not include deleted
+ assertNotContained("frog", content);
+
+ assertContains("Mattmann", content);
+
+ //TODO: extract chart text
+// assertContains("This is the chart title", content);
+
+ }
+
+ /**
+ * Test the metadata and the XML output of the Word converter
+ * for testWORD.docx, using the shared SAX parse context.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testWord() throws Exception {
+
+ XMLResult xmlResult = getXML("testWORD.docx", parseContext);
+ Metadata metadata = xmlResult.metadata;
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ //use the shared assertContains helper, as the other tests in this class do
+ assertContains("Sample Word Document", xmlResult.xml);
+
+ }
+
+ /**
+ * Test that text from footnotes.docx ("snoska") shows up in the
+ * XML output of the Word converter.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testWordFootnote() throws Exception {
+ XMLResult xmlResult = getXML("footnotes.docx", parseContext);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ //use the shared assertContains helper, as the other tests in this class do
+ assertContains("snoska", xmlResult.xml);
+
+ }
+
+ /**
+ * Test that the word converter is able to generate the
+ * correct HTML for the document.
+ * Checks that are still marked TODO are kept as comments; the documents they
+ * belong to are still parsed, so an exception during parsing fails the test.
+ */
+ @Test
+ public void testWordHTML() throws Exception {
+ XMLResult result = getXML("testWORD.docx", parseContext);
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ //the active checks use the shared assertContains helper, as elsewhere in this class
+ assertContains("Sample Word Document", xml);
+
+ // Check that custom headings came through
+//TODO: assertTrue(xml.contains("<h1 class=\"title\">"));
+
+ // Regular headings
+//TODO: assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+//TODO: assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
+ // Headings with anchor tags in them
+//TODO: assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>"));
+ // Bold and italic
+ assertContains("<b>BOLD</b>", xml);
+ assertContains("<i>ITALIC</i>", xml);
+ // Table
+ assertContains("<table>", xml);
+ assertContains("<td>", xml);
+ // Links
+ assertContains("<a href=\"http://tika.apache.org/\">Tika</a>", xml);
+ // Anchor links
+//TODO: assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml);
+ // Paragraphs with other styles
+//TODO: assertTrue(xml.contains("<p class=\"signature\">This one"));
+
+ result = getXML("testWORD_3imgs.docx", parseContext);
+ xml = result.xml;
+
+ // Images 2-4 (there is no 1!)
+//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
+//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
+//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));
+
+ // Text too
+ assertContains("<p>The end!</p>", xml);
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("testWORD_bold_character_runs2.docx", parseContext).xml;
+
+ // Make sure bold text arrived as single
+ // contiguous string even though Word parser
+ // handled this as 3 character runs
+//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ }
+
+ /**
+ * Test that we can extract image from docx header
+ */
+ @Test
+ @Ignore("TODO")
+ public void testWordPicturesInHeader() throws Exception {
+ //pass parseContext so this check runs with the same SAX configuration
+ //as the getXML call below
+ assertEquals(2, getRecursiveMetadata("headerPic.docx", parseContext).size());
+ XMLResult xmlResult = getXML("headerPic.docx", parseContext);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ // Check that an image element came through
+ assertContains("<img", xmlResult.xml);
+
+ }
+
+ /**
+ * Test docx without headers
+ * TIKA-633
+ */
+ @Test
+ public void testNullHeaders() throws Exception {
+ XMLResult xmlResult = getXML("NullHeader.docx", parseContext);
+ assertEquals("Should have found some text", false,
+ xmlResult.xml.isEmpty());
+
+ }
+
+ /**
+ * Runs testWORD_various.docx through the SAX extractor and looks for text from
+ * many different document parts, plus the keyword and subject metadata.
+ */
+ @Test
+ public void testVarious() throws Exception {
+ XMLResult parsed = getXML("testWORD_various.docx", parseContext);
+ Metadata meta = parsed.metadata;
+ String content = parsed.xml;
+ //content = content.replaceAll("\\s+"," ");
+
+ //notes, header/footer and text box
+ assertContains("Footnote appears here", content);
+ assertContains("This is a footnote.", content);
+ assertContains("This is the header text.", content);
+ assertContains("This is the footer text.", content);
+ assertContains("Here is a text box", content);
+
+ //text carrying various character formatting
+ assertContains("Bold", content);
+ assertContains("italic", content);
+ assertContains("underline", content);
+ assertContains("superscript", content);
+ assertContains("subscript", content);
+
+ assertContains("Here is a citation:", content);
+ assertContains("Figure 1 This is a caption for Figure 1", content);
+ assertContains("(Kramer)", content);
+//TODO: assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+//TODO: assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+ assertContains("This is a hyperlink", content);
+
+ assertContains("Here is a list:", content);
+ for (int bullet = 1; bullet <= 3; bullet++) {
+ //assertContains("·\tBullet " + bullet, content);
+ //assertContains("\u00b7\tBullet " + bullet, content);
+ assertContains("Bullet " + bullet, content);
+ }
+
+ assertContains("Here is a numbered list:", content);
+ for (int item = 1; item <= 3; item++) {
+ //assertContains(item + ")\tNumber bullet " + item, content);
+ //assertContains(item + ") Number bullet " + item, content);
+ // TODO: OOXMLExtractor fails to number the bullets:
+ assertContains("Number bullet " + item, content);
+ }
+
+ //every cell of the 2x3 table
+ for (int r = 1; r <= 2; r++) {
+ for (int c = 1; c <= 3; c++) {
+ String cell = "Row " + r + " Col " + c;
+ assertContains(cell, content);
+ }
+ }
+
+ assertContains("Keyword1 Keyword2", content);
+ assertEquals("Keyword1 Keyword2",
+ meta.get(Metadata.KEYWORDS));
+
+ assertContains("Subject is here", content);
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("Subject is here",
+ meta.get(Metadata.SUBJECT));
+ assertEquals("Subject is here",
+ meta.get(OfficeOpenXMLCore.SUBJECT));
+
+ assertContains("Suddenly some Japanese text:", content);
+ // Special version of (GHQ)
+ assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+ // a further run of Japanese characters
+ assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+ assertContains("And then some Gothic text:", content);
+ // characters outside the BMP, written as surrogate pairs
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ }
+
+ /**
+ * Parses testWORD_custom_props.docx directly with an OOXMLParser and checks the
+ * standard and custom ("custom:" prefixed) metadata values.
+ */
+ @Test
+ public void testWordCustomProperties() throws Exception {
+ Metadata metadata = new Metadata();
+
+ try (InputStream input = OOXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD_custom_props.docx")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ //this test builds its own context, which otherwise carries no
+ //OfficeParserConfig; enable the SAX docx extractor here as setUp does
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setUseSAXDocxExtractor(true);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+ new OOXMLParser().parse(input, handler, metadata, context);
+ }
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
+ assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME));
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("2", metadata.get(Office.WORD_COUNT));
+ assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
+ assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
+ // TODO: Remove subject in Tika 2.0
+ assertEquals("My subject", metadata.get(Metadata.SUBJECT));
+ assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ }
+
+ // TIKA-989: all five markers must be present, in this order:
+ // text, first embedded div, text, second embedded div, text
+ @Test
+ @Ignore("TODO")
+ public void testEmbeddedPDF() throws Exception {
+ String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+ int introAt = xml.indexOf("Here is the pdf file:");
+ int firstEmbedAt = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+ int byeAt = xml.indexOf("Bye Bye");
+ int secondEmbedAt = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+ int closingAt = xml.indexOf("Bye for real.");
+
+ //presence
+ assertTrue(introAt != -1);
+ assertTrue(firstEmbedAt != -1);
+ assertTrue(byeAt != -1);
+ assertTrue(secondEmbedAt != -1);
+ assertTrue(closingAt != -1);
+
+ //relative order
+ assertTrue(introAt < firstEmbedAt);
+ assertTrue(firstEmbedAt < byeAt);
+ assertTrue(byeAt < secondEmbedAt);
+ assertTrue(secondEmbedAt < closingAt);
+ }
+
+ // TIKA-1006
+ @Test
+ public void testWordNullStyle() throws Exception {
+ //pass parseContext so this document is parsed with the SAX configuration
+ //from setUp, like the other tests in this class
+ String xml = getXML("testWORD_null_style.docx", parseContext).xml;
+ assertContains("Test av styrt dokument", xml);
+ }
+
+ /**
+ * TIKA-1044 - Handle word documents where parts of the
+ * text have no formatting or styles applied to them
+ */
+ @Test
+ public void testNoFormat() throws Exception {
+ assertContains("This is a piece of text that causes an exception",
+ getXML("testWORD_no_format.docx", parseContext).xml);
+ }
+
+ /**
+ * Counterpart to the "frog" check in basicTest: with deleted and move-from content
+ * switched on in the config, "frog" is expected in the output and
+ * "Second paragraph" is expected twice.
+ */
+ @Test
+ public void testSkipDeleted() throws Exception {
+ OfficeParserConfig includeAllConfig = new OfficeParserConfig();
+ includeAllConfig.setIncludeDeletedContent(true);
+ includeAllConfig.setUseSAXDocxExtractor(true);
+ includeAllConfig.setIncludeMoveFromContent(true);
+
+ ParseContext includeAllContext = new ParseContext();
+ includeAllContext.set(OfficeParserConfig.class, includeAllConfig);
+
+ String xml = getXML("testWORD_2006ml.docx", includeAllContext).xml;
+ assertContains("frog", xml);
+ assertContainsCount("Second paragraph", xml, 2);
+ }
+
+ // TIKA-1005:
+ @Test
+ public void testTextInsideTextBox() throws Exception {
+ String xml = getXML("testWORD_text_box.docx", parseContext).xml;
+ assertContains("This text is directly in the body of the document.", xml);
+ assertContains("This text is inside of a text box in the body of the document.", xml);
+ assertContains("This text is inside of a text box in the header of the document.", xml);
+ assertContains("This text is inside of a text box in the footer of the document.", xml);
+ }
+
+ /**
+ * Regression test for text that went missing, see
+ * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>
+ * and TIKA-1317
+ */
+ @Test
+ public void testMissingText() throws Exception {
+ XMLResult parsed = getXML("testWORD_missing_text.docx", parseContext);
+ String contentType = parsed.metadata.get(Metadata.CONTENT_TYPE);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ contentType);
+
+ String xml = parsed.xml;
+ for (String expected : new String[]{"BigCompany", "Seasoned", "Rich_text_in_cell"}) {
+ assertContains(expected, xml);
+ }
+ }
+
+ //TIKA-792; with room for future missing bean tests
+ //Parses each listed file while stderr is captured and expects nothing to be written to it.
+ @Test
+ public void testWordMissingOOXMLBeans() throws Exception {
+ //If a bean is missing, POI prints stack trace to stderr
+ String[] fileNames = new String[]{
+ "testWORD_missing_ooxml_bean1.docx",//TIKA-792
+ };
+ PrintStream origErr = System.err;
+ for (String fileName : fileNames) {
+
+ //grab stderr
+ ByteArrayOutputStream errContent = new ByteArrayOutputStream();
+ PrintStream capture = new PrintStream(errContent, true, UTF_8.name());
+ System.setErr(capture);
+ try {
+ getXML(fileName, parseContext);
+ } finally {
+ //return stderr, even when the parse throws, so that a failure here
+ //does not leave System.err redirected for whatever runs next
+ System.setErr(origErr);
+ capture.flush();
+ }
+
+ String err = errContent.toString(UTF_8.name());
+ //comparing against "" puts the captured output into the failure message
+ assertEquals("unexpected stderr output for " + fileName, "", err);
+ }
+ }
+
+ /**
+ * The body text and the embedded-thumbnail div must both be present,
+ * with the text coming first.
+ */
+ @Test
+ public void testDOCXThumbnail() throws Exception {
+ String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
+ int textAt = xml.indexOf("This file contains a thumbnail");
+ int thumbnailAt = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
+
+ assertTrue(textAt != -1);
+ assertTrue(thumbnailAt != -1);
+ assertTrue(textAt < thumbnailAt);
+ }
+
+ /**
+ * Parses a password-protected docx twice: once with a PasswordProvider that returns
+ * "tika", expecting the document text, and once with the shared context that has no
+ * PasswordProvider, expecting an EncryptedDocumentException.
+ */
+ @Test
+ public void testEncrypted() throws Exception {
+ //file name -> text expected once the file has been decrypted
+ Map<String, String> tests = new HashMap<>();
+ tests.put("testWORD_protected_passtika.docx",
+ "This is an encrypted Word 2007 File");
+
+ PasswordProvider passwordProvider = new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "tika";
+ }
+ };
+
+ OfficeParserConfig opc = new OfficeParserConfig();
+ opc.setUseSAXDocxExtractor(true);
+ ParseContext passwordContext = new ParseContext();
+ passwordContext.set(PasswordProvider.class, passwordProvider);
+ passwordContext.set(OfficeParserConfig.class, opc);
+ for (Map.Entry<String, String> e : tests.entrySet()) {
+ assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml);
+ }
+
+ //now try with no password
+ for (Map.Entry<String, String> e : tests.entrySet()) {
+ boolean exc = false;
+ try {
+ getXML(e.getKey(), parseContext);
+ } catch (EncryptedDocumentException ex) {
+ exc = true;
+ }
+ assertTrue(exc);
+ }
+
+ }
+
+ /**
+ * For now only checks one piece of text from the numbered-list document;
+ * the list-numbering expectations themselves are parked in the TODO block.
+ */
+ @Test
+ public void testDOCXParagraphNumbering() throws Exception {
+ XMLResult parsed = getXML("testWORD_numbered_list.docx", parseContext);
+ String xml = parsed.xml;
+ //SAX parser is getting this. DOM parser is not
+ assertContains("add a list here", xml);
+/*TODO:
+ assertContains("1) This", xml);
+ assertContains("a) Is", xml);
+ assertContains("i) A multi", xml);
+ assertContains("ii) Level", xml);
+ assertContains("1. Within cell 1", xml);
+ assertContains("b. Cell b", xml);
+ assertContains("iii) List", xml);
+ assertContains("2) foo", xml);
+ assertContains("ii) baz", xml);
+ assertContains("ii) foo", xml);
+ assertContains("II. bar", xml);
+ assertContains("6. six", xml);
+ assertContains("7. seven", xml);
+ assertContains("a. seven a", xml);
+ assertContains("e. seven e", xml);
+ assertContains("2. A ii 2", xml);
+ assertContains("3. page break list 3", xml);
+ assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
+ assertContains("1.1.1. 1.1.1", xml);
+ assertContains("1.1. 1.2->1.1 //set the value", xml);
+*/
+ }
+
+ /**
+ * Expected output for overridden list numbering. Currently ignored (TODO).
+ */
+ @Test
+ @Ignore("TODO")
+ public void testDOCXOverrideParagraphNumbering() throws Exception {
+ //pass parseContext so that, once enabled, this test runs with the SAX
+ //configuration from setUp like the rest of this class
+ String xml = getXML("testWORD_override_list_numbering.docx", parseContext).xml;
+
+ //Test 1
+ assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml);
+ assertContains("1st.2.3someText 1st.2.3someText", xml);
+ assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
+ assertContains("5th 5th", xml);
+
+
+ //Test 2
+ assertContains("1.a.I 1.a.I", xml);
+ //test no reset because level 2 is not sufficient to reset
+ assertContains("<p>1.b.III 1.b.III</p>", xml);
+ //test restarted because of level 0's increment to 2
+ assertContains("2.a.I 2.a.I", xml);
+ //test handling of skipped level
+ assertContains("<p>2.b 2.b</p>", xml);
+
+ //Test 3
+ assertContains("(1)) (1))", xml);
+ //tests start level 1 at 17 and
+ assertContains("2.17 2.17", xml);
+ //tests that isLegal turns everything into decimal
+ assertContains("2.18.2.1 2.18.2.1", xml);
+ assertContains("<p>2 2</p>", xml);
+
+ //Test4
+ assertContains("<p>1 1</p>", xml);
+ assertContains("<p>A A</p>", xml);
+ assertContains("<p>B B</p>", xml);
+ //this tests overrides
+ assertContains("<p>C C</p>", xml);
+ assertContains("<p>4 4</p>", xml);
+
+ //Test5
+ assertContains(">00 00", xml);
+ assertContains(">01 01", xml);
+ assertContains(">01. 01.", xml);
+ assertContains(">01..1 01..1", xml);
+ assertContains(">02 02", xml);
+ }
+
+ /**
+ * A document with several authors and managers must expose them
+ * as multi-valued metadata.
+ */
+ @Test
+ public void testMultiAuthorsManagers() throws Exception {
+ Metadata metadata = getXML("testWORD_multi_authors.docx", parseContext).metadata;
+
+ String[] creators = metadata.getValues(TikaCoreProperties.CREATOR);
+ assertEquals(3, creators.length);
+ assertEquals("author2", creators[1]);
+
+ String[] managerNames = metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+ assertEquals(2, managerNames.length);
+ assertEquals("manager1", managerNames[0]);
+ assertEquals("manager2", managerNames[1]);
+ }
+
+ /**
+ * Entry 11 of the recursive parse must carry both original
+ * source paths of embed1.zip.
+ */
+ @Test
+ public void testOrigSourcePath() throws Exception {
+ List<Metadata> allMetadata = getRecursiveMetadata("test_recursive_embedded.docx", parseContext);
+ Metadata embed1ZipMetadata = allMetadata.get(11);
+ List<String> originalNames =
+ Arrays.asList(embed1ZipMetadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
+ assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip", originalNames);
+ assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", originalNames);
+ }
+
+ /**
+ * TIKA-1255: bold runs inside and around hyperlinks.
+ * Whitespace is collapsed before the comparison.
+ */
+ @Test
+ @Ignore("TODO")
+ public void testBoldHyperlink() throws Exception {
+ String normalized = getXML("testWORD_boldHyperlink.docx", parseContext).xml
+ .replaceAll("\\s+", " ");
+ assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", normalized);
+ assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", normalized);
+ }
+
+ @Test
+ public void testLongForIntExceptionInSummaryDetails() throws Exception {
+ //TIKA-2055
+ assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx", parseContext).xml);
+ }
+
+ /**
+ * Builds the minimum metadata expected for a macro entry and checks that the
+ * recursive parse of testWORD_macros.docm contains at least that.
+ */
+ @Test
+ @Ignore("TODO")
+ public void testMacrosInDocm() throws Exception {
+ String contentKey = RecursiveParserWrapper.TIKA_CONTENT.getName();
+ String macroType = TikaCoreProperties.EmbeddedResourceType.MACRO.toString();
+
+ Metadata expectedMacro = new Metadata();
+ expectedMacro.add(contentKey, "Sub Embolden()");
+ expectedMacro.add(contentKey, "Sub Italicize()");
+ expectedMacro.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+ expectedMacro.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, macroType);
+
+ assertContainsAtLeast(expectedMacro, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
deleted file mode 100644
index 06f0eed..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.junit.Test;
-
-
-public class SXWPFExtractorTest extends TikaTest {
-
- @Test
- public void basicTest() throws Exception {
- ParseContext pc = new ParseContext();
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setUseSAXDocxExtractor(true);
-
- pc.set(OfficeParserConfig.class, officeParserConfig);
- List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.docx", pc);
-
- assertEquals(8, metadataList.size());
- Metadata m = metadataList.get(0);
-
- assertEquals("2016-11-29T00:58:00Z", m.get(TikaCoreProperties.CREATED));
- assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED));
- assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
- assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
- assertEquals("3", m.get(OfficeOpenXMLCore.REVISION));
- assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER));
- //assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
- assertEquals("260", m.get(Office.WORD_COUNT));
- assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
- assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
- assertEquals("12", m.get(Office.LINE_COUNT));
- assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
-
-
- String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
-
-
- assertContainsCountTimes("engaging title page", content, 1);
- assertContainsCountTimes("This is the Author", content, 1);
- assertContainsCountTimes("This is an engaging title page", content, 1);
-
- assertContains("My Document Title", content);
- assertContains("My Document Subtitle", content);
-
- assertContains("<p>\tHeading1\t3</p>", content);
-
-
- //TODO: integrate numbering
- assertContains("Really basic 2.", content);
-
- assertContainsCountTimes("This is a text box", content, 1);
-
- assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
-
- assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content);
-
- assertContains("<p>This is 10 spaces</p>", content);
-
- //caption
- assertContains("<p>Table 1: Table1 Caption</p>", content);
-
- //embedded table
- //TODO: figure out how to handle embedded tables in html
- assertContains("<p>Embedded table r1c1</p>", content);
-
- //shape
- assertContainsCountTimes("<p>This is text within a shape", content, 1);
-
- //sdt rich text
- assertContains("<p>Rich text content control", content);
-
- //sdt simple text
- assertContains("<p>Simple text content control", content);
-
- //sdt repeating
- assertContains("Repeating content", content);
-
- //sdt dropdown
- //TODO: get options for dropdown
- assertContains("Drop down1", content);
-
- //sdt date
- assertContains("<p>11/16/2016</p>", content);
-
- //test that <tab/> works
- assertContains("tab\ttab", content);
-
- assertContainsCountTimes("serious word art", content, 1);
- assertContainsCountTimes("Wordartr1c1", content, 1);
-
- //glossary document contents
- assertContains("Click or tap to enter a date", content);
-
- //basic formatting
- assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
- content);
-
- //TODO: add chart parsing
-// assertContains("This is the chart", content);
-
- assertContains("This is a comment", content);
-
- assertContains("This is an endnote", content);
-
- assertContains("this is the footnote", content);
-
- assertContains("First page header", content);
-
- assertContains("Even page header", content);
-
- assertContains("Odd page header", content);
-
- assertContains("First page footer", content);
-
- assertContains("Even page footer", content);
-
- assertContains("Odd page footer", content);
-
- //test default does not include deleted
- assertNotContained("frog", content);
-
- assertContains("Mattmann", content);
-
- //TODO: extract chart text
-// assertContains("This is the chart title", content);
-
- }
-
- @Test
- public void testSkipDeleted() throws Exception {
- ParseContext pc = new ParseContext();
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setIncludeDeletedContent(true);
- officeParserConfig.setUseSAXDocxExtractor(true);
- officeParserConfig.setIncludeMoveFromContent(true);
- pc.set(OfficeParserConfig.class, officeParserConfig);
-
- XMLResult r = getXML("testWORD_2006ml.docx", pc);
- assertContains("frog", r.xml);
- assertContainsCount("Second paragraph", r.xml, 2);
-
- }
-
- private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
- int i = haystack.indexOf("engaging title page");
- int cnt = 0;
- while (i > -1) {
- cnt++;
- i = haystack.indexOf("engaging title page", i+1);
- }
- assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount,
- expectedCount, cnt);
-
- }
-
-
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
new file mode 100644
index 0000000..cad9c5a
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="useSAXDocxExtractor" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
[7/7] tika git commit: update changes for TIKA-2191 and TIKA-2192
Posted by ta...@apache.org.
update changes for TIKA-2191 and TIKA-2192
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5425d02a
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5425d02a
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5425d02a
Branch: refs/heads/master
Commit: 5425d02a1ed97ce5f884a076f55ad8197cc6ac7b
Parents: 615bf75
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 09:06:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:06:27 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/5425d02a/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 8a97cd3..2dd9181 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??
+ * Enabled extraction of embedded objects from headers, footers,
+ footnotes, endnotes and comments in legacy .docx parser (TIKA-2192).
+
* Allow extraction of PDActions (including Javascript) from
PDFs (TIKA-2090).
@@ -7,7 +10,7 @@ Release 1.15 - ??
deleted text to align with .doc (TIKA-2187).
* Added experimental SAX parser for .docx files. To select this parser,
- set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
+ set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191).
* Change default behavior to parse embedded documents even if the user
forgets to specify a Parser.class in the ParseContext (TIKA-2096).
[2/7] tika git commit: TIKA-2191 -- step2 -- add handling for docm
files...extract macros
Posted by ta...@apache.org.
TIKA-2191 -- step2 -- add handling for docm files...extract macros
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f93d4e1f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f93d4e1f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f93d4e1f
Branch: refs/heads/master
Commit: f93d4e1fffdb4a441f7fa750a43691adfa70c392
Parents: 8943013
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 11:14:34 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:00 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 15 +++++--
.../microsoft/ooxml/SXWPFExtractorTest.java | 42 +++++++++++---------
2 files changed, 35 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index e08dab1..ee88f15 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -72,7 +72,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
//handle main document
- List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ List<PackagePart> pps = getMainDocumentParts();
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -81,7 +81,6 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
//handle glossary document
pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -145,7 +144,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
context.get(OfficeParserConfig.class)), hyperlinks))));
} catch (TikaException e) {
- e.printStackTrace();
+ //swallow
}
}
@@ -217,6 +216,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
- return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ //find the main part: try the plain document type, then macro-enabled document, then macro-enabled template
+ List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ if (pps.size() == 0) {
+ pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
+ if (pps.size() == 0) {
+ pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+ }
+ }
+ return pps;
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/f93d4e1f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index fb7a977..dffa112 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -148,9 +148,6 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
content);
- //TODO: add chart parsing
-// assertContains("This is the chart", content);
-
assertContains("This is a comment", content);
assertContains("This is an endnote", content);
@@ -177,6 +174,9 @@ public class SXWPFExtractorTest extends TikaTest {
//TODO: extract chart text
// assertContains("This is the chart title", content);
+ //TODO: add chart parsing
+// assertContains("This is the chart", content);
+
}
/**
@@ -261,15 +261,18 @@ public class SXWPFExtractorTest extends TikaTest {
// Text too
assertTrue(xml.contains("<p>The end!</p>"));
+ }
+ @Test
+ public void testContiguousHTMLFormatting() throws Exception {
// TIKA-692: test document containing multiple
// character runs within a bold tag:
- xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
+ String xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml;
// Make sure bold text arrived as single
// contiguous string even though Word parser
// handled this as 3 character runs
-//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
// TIKA-692: test document containing multiple
// character runs within a bold tag:
@@ -278,7 +281,7 @@ public class SXWPFExtractorTest extends TikaTest {
// Make sure bold text arrived as single
// contiguous string even though Word parser
// handled this as 3 character runs
-//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+ assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
/**
@@ -311,9 +314,9 @@ public class SXWPFExtractorTest extends TikaTest {
@Test
public void testVarious() throws Exception {
- XMLResult xmlResult = getXML("testWORD_various.docx", parseContext);
- String content = xmlResult.xml;
- Metadata metadata = xmlResult.metadata;
+ Metadata metadata = new Metadata();
+ String content = getText(getResourceAsStream("/test-documents/testWORD_various.docx"),
+ new AutoDetectParser(), parseContext, metadata);
//content = content.replaceAll("\\s+"," ");
assertContains("Footnote appears here", content);
assertContains("This is a footnote.", content);
@@ -328,8 +331,8 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
-//TODO: assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
-//TODO: assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
for (int row = 1; row <= 3; row++) {
@@ -522,7 +525,7 @@ public class SXWPFExtractorTest extends TikaTest {
String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;
int a = xml.indexOf("This file contains a thumbnail");
int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
-
+ System.out.println(xml);
assertTrue(a != -1);
assertTrue(b != -1);
assertTrue(a < b);
@@ -566,11 +569,11 @@ public class SXWPFExtractorTest extends TikaTest {
}
@Test
+ @Ignore("TODO -- paragraph list numbers")
public void testDOCXParagraphNumbering() throws Exception {
String xml = getXML("testWORD_numbered_list.docx", parseContext).xml;
- //SAX parser is getting this. DOM parser is not
+ //SAX parser is getting this. DOM parser is not!
assertContains("add a list here", xml);
-/*TODO:
assertContains("1) This", xml);
assertContains("a) Is", xml);
assertContains("i) A multi", xml);
@@ -591,11 +594,11 @@ public class SXWPFExtractorTest extends TikaTest {
assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
assertContains("1.1.1. 1.1.1", xml);
assertContains("1.1. 1.2->1.1 //set the value", xml);
-*/
+
}
@Test
- @Ignore("TODO")
+ @Ignore("TODO -- paragraph list numbers")
public void testDOCXOverrideParagraphNumbering() throws Exception {
String xml = getXML("testWORD_override_list_numbering.docx").xml;
@@ -678,8 +681,11 @@ public class SXWPFExtractorTest extends TikaTest {
}
@Test
- @Ignore("TODO")
public void testMacrosInDocm() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
+ //check that content came out of the .docm file
+ assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
@@ -687,7 +693,7 @@ public class SXWPFExtractorTest extends TikaTest {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
- assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parseContext));
+ assertContainsAtLeast(minExpected, metadataList);//, parseContext));
}
[5/7] tika git commit: TIKA-2191 -- step 5 actually extract images
embedded in areas besides the body of docx/m
Posted by ta...@apache.org.
TIKA-2191 -- step 5 actually extract images embedded in areas besides the body of docx/m
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4469ca2c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4469ca2c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4469ca2c
Branch: refs/heads/master
Commit: 4469ca2c4ea725e9f5d94c116aaf248deea2a6eb
Parents: 806eaf8
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 08:43:59 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:48 2016 -0500
----------------------------------------------------------------------
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +++-
.../ooxml/SXWPFWordExtractorDecorator.java | 47 ++++++++++++++--
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 32 ++++++++---
.../parser/microsoft/ooxml/OOXMLParserTest.java | 54 ++++++++++++-------
.../microsoft/ooxml/SXWPFExtractorTest.java | 35 ++++++++++--
.../test-documents/testWORD_embedded_pics.docx | Bin 0 -> 52399 bytes
6 files changed, 142 insertions(+), 37 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index f9ba8a6..6bc867d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -22,7 +22,9 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
@@ -158,10 +160,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private void handleEmbeddedParts(ContentHandler handler)
throws TikaException, IOException, SAXException {
+ Set<String> seen = new HashSet<>();
try {
for (PackagePart source : getMainDocumentParts()) {
for (PackageRelationship rel : source.getRelationships()) {
-
+ URI targetURI = rel.getTargetURI();
+ if (targetURI != null) {
+ if (seen.contains(targetURI.toString())) {
+ continue;
+ }
+ seen.add(targetURI.toString());
+ }
URI sourceURI = rel.getSourceURI();
String sourceDesc;
if (sourceURI != null) {
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 8634cd6..43fca3b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -59,13 +60,21 @@ import org.xml.sax.SAXException;
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
- RELATION_OLE_OBJECT,
RELATION_AUDIO,
RELATION_IMAGE,
RELATION_PACKAGE,
RELATION_OFFICE_DOCUMENT
};
+ //include all parts that might have embedded objects
+ private final static String[] MAIN_PART_RELATIONS = new String[]{
+ XWPFRelation.HEADER.getRelation(),
+ XWPFRelation.FOOTER.getRelation(),
+ XWPFRelation.FOOTNOTE.getRelation(),
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+ };
+
private final OPCPackage opcPackage;
private final ParseContext context;
@@ -82,7 +91,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
//handle main document
- List<PackagePart> pps = getMainDocumentParts();
+ List<PackagePart> pps = getStoryDocumentParts();
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -176,6 +185,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
for (String rel : EMBEDDED_RELATIONSHIPS) {
+
prc = bodyPart.getRelationshipsByType(rel);
for (int i = 0; i < prc.size(); i++) {
PackageRelationship pr = prc.getRelationship(i);
@@ -247,11 +257,40 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * This returns the main document only.
+ * This returns all items that might contain embedded objects:
+ * main document, headers, footers, comments, etc.
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
- //figure out which one this is
+
+ List<PackagePart> mainStoryDocs = getStoryDocumentParts();
+ List<PackagePart> relatedParts = new ArrayList<>();
+
+ for (PackagePart pp : mainStoryDocs) {
+ addRelatedParts(pp, relatedParts);
+ }
+ relatedParts.addAll(mainStoryDocs);
+ return relatedParts;
+ }
+
+ private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+ for (String relation : MAIN_PART_RELATIONS) {
+ PackageRelationshipCollection prc = null;
+ try {
+ prc = documentPart.getRelationshipsByType(relation);
+ if (prc != null) {
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+ relatedParts.add(packagePart);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ }
+ }
+
+ }
+
+ private List<PackagePart> getStoryDocumentParts() {
List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
if (pps.size() == 0) {
pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 2538215..b2e74d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -34,6 +34,7 @@ import org.xml.sax.helpers.DefaultHandler;
public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+
enum EditType {
NONE,
INSERT,
@@ -48,6 +49,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final static String O_NS = "urn:schemas-microsoft-com:office:office";
private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+ private static final String V_NS = "urn:schemas-microsoft-com:vml";
private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
@@ -71,6 +73,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private boolean inDelText = false;
private boolean inPic = false;
+ private boolean inPict = false;
private String picDescription = null;
private String picRId = null;
private String picFilename = null;
@@ -154,6 +157,13 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
}
}
+ if (uri == null || uri.equals(V_NS)) {
+ if ("imagedata".equals(localName)) {
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ picDescription = atts.getValue(O_NS, "title");
+ }
+ }
+
if (uri == null || uri.equals(W_NS)) {
if (localName.equals("p")) {
bodyContentsHandler.startParagraph();
@@ -248,14 +258,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
if (PIC_NS.equals(uri)) {
if ("pic".equals(localName)) {
- String picFileName = null;
- if (picRId != null) {
- picFileName = linkedRelationships.get(picRId);
- }
- bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
- picDescription = null;
- picRId = null;
+ handlePict();
inPic = false;
+ return;
}
}
@@ -291,10 +296,23 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
editType = EditType.NONE;
} else if (localName.equals("hyperlink")) {
bodyContentsHandler.hyperlinkEnd();
+ } else if ("pict".equals(localName)) {
+ handlePict();
}
}
}
+ private void handlePict() {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0059d09..a831006 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,30 +468,44 @@ public class OOXMLParserTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
+ @Ignore("fix actual extraction")
public void testWordPicturesInHeader() throws Exception {
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
+ List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
+ assertEquals(2, metadataList.size());
+ Metadata m = metadataList.get(0);
+ String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ m.get(Metadata.CONTENT_TYPE));
+ // Check that the image reference from the header came through
+ assertTrue(mainContent.contains("<img"));
+ }
- // Try with a document containing various tables and formattings
- try (InputStream input = getTestDocument("headerPic.docx")) {
- parser.parse(input, handler, metadata, context);
- String xml = sw.toString();
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- // Check that custom headings came through
- assertTrue(xml.contains("<img"));
+ @Test
+ @Ignore("not currently extracting from non-body components")
+ public void testPicturesInVariousPlaces() throws Exception {
+ //test that images are actually extracted from
+ //headers, footers, comments, endnotes, footnotes
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx");
+
+ //only process embedded resources once
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ for (int i = 1; i < 4; i++) {
+ assertContains("header"+i+"_pic", content);
+ assertContains("footer"+i+"_pic", content);
}
+ assertContains("body_pic.jpg", content);
+ assertContains("sdt_pic.jpg", content);
+ assertContains("deeply_embedded_pic", content);
+ assertContains("deleted_pic", content);//TODO: don't extract this
+ assertContains("footnotes_pic", content);
+ assertContains("comments_pic", content);
+ assertContains("endnotes_pic", content);
+// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+ assertContainsCount("<img src=", content, 14);
}
-
/**
* Documents with some sheets are protected, but not all.
* See TIKA-364.
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index f4a1aeb..6064be2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -290,16 +290,41 @@ public class SXWPFExtractorTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
- @Ignore("TODO")
public void testWordPicturesInHeader() throws Exception {
- assertEquals(2, getRecursiveMetadata("headerPic.docx").size());
- XMLResult xmlResult = getXML("headerPic.docx", parseContext);
+ List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx", parseContext);
+ assertEquals(2, metadataList.size());
+ Metadata m = metadataList.get(0);
+ String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
assertEquals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ m.get(Metadata.CONTENT_TYPE));
// Check that custom headings came through
- assertTrue(xmlResult.xml.contains("<img"));
+ assertTrue(mainContent.contains("<img"));
+ }
+ @Test
+ public void testPicturesInVariousPlaces() throws Exception {
+ //test that images are actually extracted from
+ //headers, footers, comments, endnotes, footnotes
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx", parseContext);
+
+ //only process embedded resources once
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ for (int i = 1; i < 4; i++) {
+ assertContains("header"+i+"_pic", content);
+ assertContains("footer"+i+"_pic", content);
+ }
+ assertContains("body_pic.jpg", content);
+ assertContains("sdt_pic.jpg", content);
+ assertContains("deeply_embedded_pic", content);
+ assertContains("deleted_pic", content);//TODO: don't extract this
+ assertContains("footnotes_pic", content);
+ assertContains("comments_pic", content);
+ assertContains("endnotes_pic", content);
+// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+
+ assertContainsCount("<img src=", content, 14);
}
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/4469ca2c/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx
new file mode 100644
index 0000000..1a63e6f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_embedded_pics.docx differ
[3/7] tika git commit: TIKA-2191 -- step 3 -- clean up <b> and <i>
tag handling
Posted by ta...@apache.org.
TIKA-2191 -- step 3 -- clean up <b> and <i> tag handling
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1aca10a2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1aca10a2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1aca10a2
Branch: refs/heads/master
Commit: 1aca10a26dada02a045a1bc9eb7c3cfc1b36a83e
Parents: f93d4e1
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 12:17:56 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:16 2016 -0500
----------------------------------------------------------------------
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 19 ++----
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 9 ++-
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 71 +++++++++++++++-----
.../microsoft/ooxml/SXWPFExtractorTest.java | 4 +-
4 files changed, 70 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index dce36a2..9e5ce6b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -65,7 +65,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private boolean inRPr = false;
private boolean inNumPr = false;
private boolean inDelText = false;
- private boolean inHyperlink = false;
//alternate content can be embedded in itself.
//need to track depth.
@@ -73,7 +72,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private int inACChoiceDepth = 0;
private int inACFallbackDepth = 0;
private EditType editType = EditType.NONE;
- private String hyperlink = null;
private XWPFRunProperties currRunProperties = new XWPFRunProperties();
@@ -151,10 +149,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
startEditedSection(editType.MOVE_FROM, atts);
} else if (localName.equals("hyperlink")) {
String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ String hyperlink = null;
if (hyperlinkId != null) {
hyperlink = hyperlinks.get(hyperlinkId);
}
- inHyperlink = true;
+ bodyContentsHandler.hyperlinkStart(hyperlink);
} else if (localName.equals("footnoteReference")) {
String id = atts.getValue(W_NS, "id");
bodyContentsHandler.footnoteReference(id);
@@ -210,7 +209,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
}
- if (localName.equals("r") && !inHyperlink) {
+ if (localName.equals("r")) {
bodyContentsHandler.run(currRunProperties, runBuffer.toString());
inR = false;
runBuffer.setLength(0);
@@ -235,13 +234,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
localName.equals("moveTo") || localName.equals("moveFrom")) {
editType = EditType.NONE;
} else if (localName.equals("hyperlink")) {
- if (hyperlink != null) {
- bodyContentsHandler.hyperlinkRun(hyperlink, runBuffer.toString());
- } else {
- bodyContentsHandler.run(currRunProperties, runBuffer.toString());
- }
- runBuffer.setLength(0);
- inHyperlink = false;
+ bodyContentsHandler.hyperlinkEnd();
}
}
}
@@ -281,7 +274,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
void run(XWPFRunProperties runProperties, String contents);
- void hyperlinkRun(String link, String text);
+ void hyperlinkStart(String link);
+
+ void hyperlinkEnd();
void startParagraph();
http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 06ef951..4ee7a4f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -265,8 +265,13 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void hyperlinkRun(String link, String text) {
- buffer.append(" (").append(text).append(") ");
+ public void hyperlinkStart(String link) {
+ //no-op
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ //no-op
}
@Override
http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index 2f27739..d62e270 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -38,6 +38,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
private int pDepth = 0; //paragraph depth
private boolean isItalics = false;
private boolean isBold = false;
+ private boolean wroteHyperlinkStart = false;
public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFListManager listManager, OfficeParserConfig parserConfig) {
this.xhtml = xhtml;
@@ -48,37 +49,57 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
@Override
public void run(XWPFRunProperties runProperties, String contents) {
- //TODO: smooth out bold/italics to handle only changes
- //If two runs are bold, only add <b> at beginning and end of the run pair
try {
- if (runProperties.getBold()) {
- xhtml.startElement("b");
+ // True if we are currently in the named style tag:
+ if (runProperties.getBold() != isBold) {
+ if (isItalics) {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (runProperties.getBold()) {
+ xhtml.startElement("b");
+ isBold = true;
+ } else {
+ xhtml.endElement("b");
+ isBold = false;
+ }
}
- if (runProperties.getItalics()) {
- xhtml.startElement("i");
+
+ if (runProperties.getItalics() != isItalics) {
+ if (runProperties.getItalics()) {
+ xhtml.startElement("i");
+ isItalics = true;
+ } else {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
}
xhtml.characters(contents);
- if (runProperties.getItalics()) {
- xhtml.endElement("i");
- }
- if (runProperties.getBold()) {
- xhtml.endElement("b");
- }
+
} catch (SAXException e) {
}
}
@Override
- public void hyperlinkRun(String link, String text) {
- //System.out.println("tika handler: "+link + " :: "+text);
+ public void hyperlinkStart(String link) {
try {
if (link != null) {
xhtml.startElement("a", "href", link);
+ wroteHyperlinkStart = true;
}
- xhtml.characters(text);
- if (link != null) {
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ try {
+ if (wroteHyperlinkStart) {
+ closeStyleTags();
+ wroteHyperlinkStart = false;
xhtml.endElement("a");
}
} catch (SAXException e) {
@@ -101,6 +122,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
@Override
public void endParagraph() {
try {
+ closeStyleTags();
if (pDepth == 1) {
xhtml.endElement("p");
} else {
@@ -168,7 +190,11 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
@Override
public void startSDT() {
- //no-op
+ try {
+ closeStyleTags();
+ } catch (SAXException e) {
+
+ }
}
@Override
@@ -221,4 +247,15 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
public boolean getIncludeMoveFromText() {
return includeMoveFromText;
}
+
+ private void closeStyleTags() throws SAXException {
+ if (isItalics) {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (isBold) {
+ xhtml.endElement("b");
+ isBold = false;
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/1aca10a2/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index dffa112..22e5644 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -144,7 +144,7 @@ public class SXWPFExtractorTest extends TikaTest {
//glossary document contents
assertContains("Click or tap to enter a date", content);
- //basic formatting
+ //basic b/i tags...make sure not to overlap!
assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
content);
@@ -665,10 +665,10 @@ public class SXWPFExtractorTest extends TikaTest {
}
@Test
- @Ignore("TODO")
public void testBoldHyperlink() throws Exception {
//TIKA-1255
String xml = getXML("testWORD_boldHyperlink.docx", parseContext).xml;
+ System.out.println(xml);
xml = xml.replaceAll("\\s+", " ");
assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml);
[4/7] tika git commit: TIKA-2191 -- step 4-- add markup for embedded
pics
Posted by ta...@apache.org.
TIKA-2191 -- step 4-- add markup for embedded pics
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/806eaf8b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/806eaf8b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/806eaf8b
Branch: refs/heads/master
Commit: 806eaf8b1802a3a3071a5ae0bdc35c20d6739280
Parents: 1aca10a
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 5 13:28:27 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:01:34 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 47 ++++++++++++--
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 66 +++++++++++++++++++-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 10 +++
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 38 +++++++++++
.../microsoft/ooxml/SXWPFExtractorTest.java | 44 ++++++++++++-
5 files changed, 193 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index ee88f15..8634cd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
@@ -29,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
@@ -56,10 +58,18 @@ import org.xml.sax.SAXException;
*/
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+ private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
+ RELATION_OLE_OBJECT,
+ RELATION_AUDIO,
+ RELATION_IMAGE,
+ RELATION_PACKAGE,
+ RELATION_OFFICE_DOCUMENT
+ };
private final OPCPackage opcPackage;
private final ParseContext context;
+
public SXWPFWordExtractorDecorator(ParseContext context,
XWPFEventBasedWordExtractor extractor) {
super(context, extractor);
@@ -135,22 +145,22 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private void handlePart(PackagePart packagePart,
XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
- Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+ Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
try (InputStream stream = packagePart.getInputStream()) {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
new XWPFDocumentXMLBodyHandler(
new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
- context.get(OfficeParserConfig.class)), hyperlinks))));
+ context.get(OfficeParserConfig.class)), linkedRelationships))));
} catch (TikaException e) {
//swallow
}
}
- private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
- Map<String, String> hyperlinks = new HashMap<>();
+ private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
+ Map<String, String> linkedRelationships = new HashMap<>();
try {
PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
for (int i = 0; i < prc.size(); i++) {
@@ -161,12 +171,37 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
String id = pr.getId();
String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
if (id != null && url != null) {
- hyperlinks.put(id, url);
+ linkedRelationships.put(id, url);
+ }
+ }
+
+ for (String rel : EMBEDDED_RELATIONSHIPS) {
+ prc = bodyPart.getRelationshipsByType(rel);
+ for (int i = 0; i < prc.size(); i++) {
+ PackageRelationship pr = prc.getRelationship(i);
+ if (pr == null) {
+ continue;
+ }
+ String id = pr.getId();
+ String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+ String fileName = uriString;
+ if (pr.getTargetURI() != null) {
+ try {
+ fileName = FileHelper.getFilename(new File(fileName));
+ } catch (Exception e) {
+ fileName = uriString;
+ }
+ }
+ if (id != null) {
+ fileName = (fileName == null) ? "" : fileName;
+ linkedRelationships.put(id, fileName);
+ }
}
}
+
} catch (InvalidFormatException e) {
}
- return hyperlinks;
+ return linkedRelationships;
}
/*
private XWPFStyles loadStyles(PackagePart packagePart) {
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 9e5ce6b..2538215 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -45,6 +45,10 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+ private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+ private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+ private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+
private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
private final static char[] TAB = new char[1];
@@ -55,7 +59,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private final XWPFBodyContentsHandler bodyContentsHandler;
//private final RelationshipsManager relationshipsManager;
- private final Map<String, String> hyperlinks;
+ private final Map<String, String> linkedRelationships;
private final StringBuilder runBuffer = new StringBuilder();
@@ -66,6 +70,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
private boolean inNumPr = false;
private boolean inDelText = false;
+ private boolean inPic = false;
+ private String picDescription = null;
+ private String picRId = null;
+ private String picFilename = null;
+
//alternate content can be embedded in itself.
//need to track depth.
//if in alternate, choose fallback, maybe make this configurable?
@@ -78,7 +87,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
Map<String, String> hyperlinks) {
this.bodyContentsHandler = bodyContentsHandler;
- this.hyperlinks = hyperlinks;
+ this.linkedRelationships = hyperlinks;
}
@@ -111,6 +120,39 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
if (inACChoiceDepth > 0) {
return;
}
+ if (uri == null || uri.equals(O_NS)) {
+ if (localName.equals("OLEObject")) {
+ String type = null;
+ String refId = null;
+ //TODO: want to get ProgID?
+ for (int i = 0; i < atts.getLength(); i++) {
+ String attLocalName = atts.getLocalName(i);
+ String attValue = atts.getValue(i);
+ if (attLocalName.equals("Type")) {
+ type = attValue;
+ } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+ refId = attValue;
+ }
+ }
+ if ("Embed".equals(type)) {
+ bodyContentsHandler.embeddedOLERef(refId);
+ }
+ }
+ }
+
+ if (uri == null || uri.equals(PIC_NS)) {
+ if ("pic".equals(localName)) {
+ inPic = true;
+ } else if ("cNvPr".equals(localName)) {
+ picDescription = atts.getValue("", "descr");
+ }
+ }
+
+ if (uri == null || uri.equals(DRAWING_MAIN_NS)) {
+ if ("blip".equals(localName)) {
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+ }
+ }
if (uri == null || uri.equals(W_NS)) {
if (localName.equals("p")) {
@@ -151,7 +193,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
String hyperlink = null;
if (hyperlinkId != null) {
- hyperlink = hyperlinks.get(hyperlinkId);
+ hyperlink = linkedRelationships.get(hyperlinkId);
}
bodyContentsHandler.hyperlinkStart(hyperlink);
} else if (localName.equals("footnoteReference")) {
@@ -203,6 +245,20 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
inACFallbackDepth--;
}
}
+
+ if (PIC_NS.equals(uri)) {
+ if ("pic".equals(localName)) {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
+ }
if (uri == null || uri.equals(W_NS)) {
if (inACChoiceDepth > 0) {
return;
@@ -309,5 +365,9 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
void endnoteReference(String id);
boolean getIncludeMoveFromText();
+
+ void embeddedOLERef(String refId);
+
+ void embeddedPicRef(String picFileName, String picDescription);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 4ee7a4f..ee6bb85 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -353,6 +353,16 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
public boolean getIncludeMoveFromText() {
return false;
}
+
+ @Override
+ public void embeddedOLERef(String refId) {
+ //no-op
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //no-op
+ }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index d62e270..cd28583 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -24,6 +24,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
@@ -248,6 +249,43 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
return includeMoveFromText;
}
+ @Override
+ public void embeddedOLERef(String relId) {
+ if (relId == null) {
+ return;
+ }
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", relId);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+
+ try {
+ AttributesImpl attr = new AttributesImpl();
+ if (picFileName != null) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+ }
+ if (picDescription != null) {
+ attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+ }
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+
+ } catch (SAXException e) {
+
+ }
+ }
+
private void closeStyleTags() throws SAXException {
if (isItalics) {
xhtml.endElement("i");
http://git-wip-us.apache.org/repos/asf/tika/blob/806eaf8b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 22e5644..f4a1aeb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
@@ -38,6 +39,7 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
@@ -418,13 +420,13 @@ public class SXWPFExtractorTest extends TikaTest {
// TIKA-989:
@Test
- @Ignore("TODO")
public void testEmbeddedPDF() throws Exception {
String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;
+ System.out.println(xml);
int i = xml.indexOf("Here is the pdf file:");
- int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\" />");
int k = xml.indexOf("Bye Bye");
- int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>");
+ int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\" />");
int m = xml.indexOf("Bye for real.");
assertTrue(i != -1);
assertTrue(j != -1);
@@ -696,5 +698,41 @@ public class SXWPFExtractorTest extends TikaTest {
assertContainsAtLeast(minExpected, metadataList);//, parseContext));
}
+ @Test
+ public void testEmbedded() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embeded.docx", parseContext);
+ Metadata main = metadataList.get(0);
+ String content = main.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //make sure mark up is there
+ assertContains("<img src=\"embedded:image2.jpeg\" alt=\"A description...\" />",
+ content);
+
+ assertContains("<div class=\"embedded\" id=\"rId8\" />",
+ content);
+
+ assertEquals(16, metadataList.size());
+ }
+
+ @Test
+ public void iterate() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, EmptyParser.INSTANCE);
+ for (File f : getResourceAsFile("/test-documents").listFiles()) {
+ if (! f.getName().equals("testWORD_embeded.docx")) {
+ continue;
+ }
+ if (f.getName().endsWith("docx") || f.getName().endsWith(".docm")) {
+ try {
+ XMLResult r = getXML(f.getName(), context);
+ if (r.xml.contains("<img")) {
+ System.out.println(f.getName());
+ }
+ System.out.println(r.xml);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
}
[6/7] tika git commit: TIKA-2192 - add extraction of embedded objects
in DOM docx parser from more than just main document
Posted by ta...@apache.org.
TIKA-2192 - add extraction of embedded objects in DOM docx parser from more than just main document
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/615bf75f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/615bf75f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/615bf75f
Branch: refs/heads/master
Commit: 615bf75fc11e8fc299be550b8cd4bb24f45a264a
Parents: 4469ca2
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 09:04:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:04:51 2016 -0500
----------------------------------------------------------------------
.../ooxml/XWPFWordExtractorDecorator.java | 35 ++++++++++++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 3 +-
2 files changed, 34 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index ccbf45e..a9eb93f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -21,7 +21,9 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@@ -38,6 +40,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
@@ -66,6 +69,16 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private static final String LIST_DELIMITER = " ";
+ //include all parts that might have embedded objects
+ private final static String[] MAIN_PART_RELATIONS = new String[]{
+ XWPFRelation.HEADER.getRelation(),
+ XWPFRelation.FOOTER.getRelation(),
+ XWPFRelation.FOOTNOTE.getRelation(),
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+ };
+
+
private XWPFDocument document;
private XWPFStyles styles;
@@ -438,16 +451,34 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * Word documents are simple, they only have the one
- * main part
+ * Include main body and anything else that can
+ * have an attachment/embedded object
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
List<PackagePart> parts = new ArrayList<PackagePart>();
parts.add(document.getPackagePart());
+ addRelatedParts(document.getPackagePart(), parts);
return parts;
}
+ /**
+  * Collects the parts related to the given document part through any of the
+  * {@link #MAIN_PART_RELATIONS} relationship types and appends them to
+  * {@code relatedParts}.
+  * <p>
+  * Collection is best effort: a relationship type, or an individual
+  * relationship, that raises {@link InvalidFormatException} is skipped so
+  * that the remaining parts are still added.
+  *
+  * @param documentPart part whose relationships are inspected
+  * @param relatedParts list that receives every related part that resolves
+  */
+ private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+     for (String relation : MAIN_PART_RELATIONS) {
+         PackageRelationshipCollection prc;
+         try {
+             prc = documentPart.getRelationshipsByType(relation);
+         } catch (InvalidFormatException e) {
+             //cannot list relationships of this type; move on to the next type
+             continue;
+         }
+         if (prc == null) {
+             continue;
+         }
+         for (int i = 0; i < prc.size(); i++) {
+             try {
+                 relatedParts.add(documentPart.getRelatedPart(prc.getRelationship(i)));
+             } catch (InvalidFormatException e) {
+                 //skip only the relationship that failed to resolve, so one bad
+                 //target does not drop the parts that follow it
+             }
+         }
+     }
+ }
+
private class TmpFormatting {
private boolean bold = false;
private boolean italic = false;
http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a831006..e84f6d0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,7 +468,6 @@ public class OOXMLParserTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
- @Ignore("fix actual extraction")
public void testWordPicturesInHeader() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
assertEquals(2, metadataList.size());
@@ -482,7 +481,7 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
- @Ignore("not currently extracting from non-body components")
+ @Ignore("need to add links in xhtml")
public void testPicturesInVariousPlaces() throws Exception {
//test that images are actually extracted from
//headers, footers, comments, endnotes, footnotes