You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/06 14:06:49 UTC
[6/7] tika git commit: TIKA-2192 - add extraction of embedded objects
in DOM docx parser from more than just main document
TIKA-2192 - add extraction of embedded objects in DOM docx parser from more than just main document
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/615bf75f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/615bf75f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/615bf75f
Branch: refs/heads/master
Commit: 615bf75fc11e8fc299be550b8cd4bb24f45a264a
Parents: 4469ca2
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 6 09:04:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 6 09:04:51 2016 -0500
----------------------------------------------------------------------
.../ooxml/XWPFWordExtractorDecorator.java | 35 ++++++++++++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 3 +-
2 files changed, 34 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index ccbf45e..a9eb93f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -21,7 +21,9 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@@ -38,6 +40,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
@@ -66,6 +69,16 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private static final String LIST_DELIMITER = " ";
+ //relationship types of the non-body parts (headers, footers, footnotes, endnotes, comments) that might have embedded objects
+ private final static String[] MAIN_PART_RELATIONS = new String[]{
+ XWPFRelation.HEADER.getRelation(),
+ XWPFRelation.FOOTER.getRelation(),
+ XWPFRelation.FOOTNOTE.getRelation(),
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes", //NOTE(review): literal URI rather than an XWPFRelation constant -- confirm none is usable here
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" //NOTE(review): literal URI as well -- confirm against XWPFRelation
+ };
+
+
private XWPFDocument document;
private XWPFStyles styles;
@@ -438,16 +451,34 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * Word documents are simple, they only have the one
- * main part
+ * Include main body and anything else that can
+ * have an attachment/embedded object
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
List<PackagePart> parts = new ArrayList<PackagePart>();
parts.add(document.getPackagePart()); //the main document part is always first in the returned list
+ addRelatedParts(document.getPackagePart(), parts); //then append the parts related to it via MAIN_PART_RELATIONS
return parts;
}
+ private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) { //appends to relatedParts every part reachable from documentPart via a MAIN_PART_RELATIONS type
+ for (String relation : MAIN_PART_RELATIONS) {
+ PackageRelationshipCollection prc = null;
+ try {
+ prc = documentPart.getRelationshipsByType(relation);
+ if (prc != null) { //guard: nothing to add when no collection is returned for this type
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i)); //resolve the relationship to its target part
+ relatedParts.add(packagePart);
+ }
+ }
+ } catch (InvalidFormatException e) { //NOTE(review): exception is silently dropped; because the try wraps the inner loop, one bad relationship also skips the remaining ones of this type
+ } //best-effort: move on to the next relation type
+ }
+
+ }
+
private class TmpFormatting {
private boolean bold = false;
private boolean italic = false;
http://git-wip-us.apache.org/repos/asf/tika/blob/615bf75f/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a831006..e84f6d0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -468,7 +468,6 @@ public class OOXMLParserTest extends TikaTest {
* Test that we can extract image from docx header
*/
@Test
- @Ignore("fix actual extraction")
public void testWordPicturesInHeader() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
assertEquals(2, metadataList.size());
@@ -482,7 +481,7 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
- @Ignore("not currently extracting from non-body components")
+ @Ignore("need to add links in xhtml")
public void testPicturesInVariousPlaces() throws Exception {
//test that images are actually extracted from
//headers, footers, comments, endnotes, footnotes