You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/12 12:20:59 UTC
tika git commit: TIKA-2191: fixes after regression testing on
TIKA_1302 corpus: 1) add 'cr' and 'br' and 2) add 'template' to potential
main story body parts
Repository: tika
Updated Branches:
refs/heads/master 192e3caa7 -> faf6c2b24
TIKA-2191: fixes after regression testing on the TIKA_1302 corpus: 1) emit a newline for 'cr' and 'br' elements and 2) add 'template' to the potential main story body parts
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/faf6c2b2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/faf6c2b2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/faf6c2b2
Branch: refs/heads/master
Commit: faf6c2b24814ded27f05388f8a417c2df5bf5c7a
Parents: 192e3ca
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 12 07:20:43 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 12 07:20:43 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 42 +++++++++++++++----
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 6 ++-
.../microsoft/ooxml/SXWPFExtractorTest.java | 17 ++++++++
.../test-documents/testWORD_template.dotx | Bin 0 -> 59583 bytes
4 files changed, 55 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 7c20a07..b97b690 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -78,6 +78,15 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
};
+ //a docx file should have one of these "main story" parts
+ private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+ XWPFRelation.DOCUMENT.getContentType(),
+ XWPFRelation.MACRO_DOCUMENT.getContentType(),
+ XWPFRelation.TEMPLATE.getContentType(),
+ XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()
+
+ };
+
private final OPCPackage opcPackage;
private final ParseContext context;
@@ -104,9 +113,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
//handle glossary document
pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
if (pps != null) {
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- handleDocumentPart(pp, xhtml);
+ if (pps.size() > 0) {
+ xhtml.startElement("div", "class", "glossary");
+
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ handleDocumentPart(pp, xhtml);
+ }
+ xhtml.endElement("div");
}
}
}
@@ -270,6 +284,11 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
List<PackagePart> mainStoryDocs = getStoryDocumentParts();
List<PackagePart> relatedParts = new ArrayList<>();
+ mainStoryDocs.addAll(
+ opcPackage.getPartsByContentType(
+ XWPFRelation.GLOSSARY_DOCUMENT.getContentType()));
+
+
for (PackagePart pp : mainStoryDocs) {
addRelatedParts(pp, relatedParts);
}
@@ -294,14 +313,19 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
+ /**
+ *
+ * @return the parts for the first main story content type that has at least
+ * one part; an empty list if no main story is found.
+ */
private List<PackagePart> getStoryDocumentParts() {
- List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
- if (pps.size() == 0) {
- pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
- if (pps.size() == 0) {
- pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+
+ for (String contentType : MAIN_STORY_PART_RELATIONS) {
+ List<PackagePart> pps = opcPackage.getPartsByContentType(contentType);
+ if (pps.size() > 0) {
+ return pps;
}
}
- return pps;
+ return new ArrayList<>();
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 92f4bfe..25621df 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -182,7 +182,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
inT = true;
} else if (localName.equals("tab")) {
runBuffer.append("\t");
- } else if (localName.equals("tbl")) {
+ } else if("br".equals(localName)) {
+ runBuffer.append("\n");
+ } else if("cr".equals(localName)) {
+ runBuffer.append("\n");
+ } else if(localName.equals("tbl")) {
bodyContentsHandler.startTable();
} else if (localName.equals("tc")) {
bodyContentsHandler.startTableCell();
http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 3fc53e2..d107756 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -730,4 +730,21 @@ public class SXWPFExtractorTest extends TikaTest {
assertEquals(16, metadataList.size());
}
+ @Test
+ public void testDotx() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_template.docx", parseContext);
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertContains("Metallica", content);
+ assertContains("Hetfield", content);
+ assertContains("one eye open", content);
+ assertContains("Getting the perfect", content);
+ //from glossary document
+ assertContains("table rows", content);
+
+ metadataList = getRecursiveMetadata("testWORD_template.dotx", parseContext);
+ content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ //from glossary document
+ assertContainsCount("ready to write", content, 2);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx b/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx
new file mode 100644
index 0000000..5d24a78
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx differ