You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/12 12:20:59 UTC

tika git commit: TIKA-2191: fixes after regression testing on TIKA_1302 corpus: 1) add 'cr' and 'br' and 2) add 'template' to potential main story body parts

Repository: tika
Updated Branches:
  refs/heads/master 192e3caa7 -> faf6c2b24


TIKA-2191: fixes after regression testing on TIKA_1302 corpus: 1) add 'cr' and 'br' and 2) add 'template' to potential main story body parts


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/faf6c2b2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/faf6c2b2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/faf6c2b2

Branch: refs/heads/master
Commit: faf6c2b24814ded27f05388f8a417c2df5bf5c7a
Parents: 192e3ca
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 12 07:20:43 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 12 07:20:43 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      |  42 +++++++++++++++----
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  |   6 ++-
 .../microsoft/ooxml/SXWPFExtractorTest.java     |  17 ++++++++
 .../test-documents/testWORD_template.dotx       | Bin 0 -> 59583 bytes
 4 files changed, 55 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 7c20a07..b97b690 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -78,6 +78,15 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
     };
 
+    //content types of the possible "main story" parts; a Word OOXML file (e.g. docx or dotx) should have one of these
+    private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+            XWPFRelation.DOCUMENT.getContentType(),
+            XWPFRelation.MACRO_DOCUMENT.getContentType(),
+            XWPFRelation.TEMPLATE.getContentType(),
+            XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()
+
+    };
+
     private final OPCPackage opcPackage;
     private final ParseContext context;
 
@@ -104,9 +113,14 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         //handle glossary document
         pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
         if (pps != null) {
-            for (PackagePart pp : pps) {
-                //likely only one, but why not...
-                handleDocumentPart(pp, xhtml);
+            if (pps.size() > 0) {
+                xhtml.startElement("div", "class", "glossary");
+
+                for (PackagePart pp : pps) {
+                    //likely only one, but why not...
+                    handleDocumentPart(pp, xhtml);
+                }
+                xhtml.endElement("div");
             }
         }
     }
@@ -270,6 +284,11 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         List<PackagePart> mainStoryDocs = getStoryDocumentParts();
         List<PackagePart> relatedParts = new ArrayList<>();
 
+        mainStoryDocs.addAll(
+                opcPackage.getPartsByContentType(
+                        XWPFRelation.GLOSSARY_DOCUMENT.getContentType()));
+
+
         for (PackagePart pp : mainStoryDocs) {
             addRelatedParts(pp, relatedParts);
         }
@@ -294,14 +313,19 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
     }
 
+    /**
+     * Tries each content type in MAIN_STORY_PART_RELATIONS, in order.
+     * @return the parts for the first main story content type that has any parts;
+     * an empty list if no main story is found.
+     */
     private List<PackagePart> getStoryDocumentParts() {
-        List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
-        if (pps.size() == 0) {
-            pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_DOCUMENT.getContentType());
-            if (pps.size() == 0) {
-                pps = opcPackage.getPartsByContentType(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType());
+
+        for (String contentType : MAIN_STORY_PART_RELATIONS) {
+            List<PackagePart> pps = opcPackage.getPartsByContentType(contentType);
+            if (pps.size() > 0) {
+                return pps;
             }
         }
-        return pps;
+        return new ArrayList<>();
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 92f4bfe..25621df 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -182,7 +182,11 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
                 inT = true;
             } else if (localName.equals("tab")) {
                 runBuffer.append("\t");
-            } else if (localName.equals("tbl")) {
+            } else if("br".equals(localName)) {
+                runBuffer.append("\n");
+            } else if("cr".equals(localName)) {
+                runBuffer.append("\n");
+            } else if(localName.equals("tbl")) {
                 bodyContentsHandler.startTable();
             } else if (localName.equals("tc")) {
                 bodyContentsHandler.startTableCell();

http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 3fc53e2..d107756 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -730,4 +730,21 @@ public class SXWPFExtractorTest extends TikaTest {
         assertEquals(16, metadataList.size());
     }
 
+    @Test
+    public void testDotx() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testWORD_template.docx", parseContext);
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("Metallica", content);
+        assertContains("Hetfield", content);
+        assertContains("one eye open", content);
+        assertContains("Getting the perfect", content);
+        //from glossary document
+        assertContains("table rows", content);
+
+        metadataList = getRecursiveMetadata("testWORD_template.dotx", parseContext);
+        content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        //from glossary document
+        assertContainsCount("ready to write", content, 2);
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/faf6c2b2/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx b/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx
new file mode 100644
index 0000000..5d24a78
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_template.dotx differ