You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/28 13:52:13 UTC

[tika] branch master updated: TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  562e4fa   TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.
562e4fa is described below

commit 562e4faf95bc85b3ab2290055860d3e075a66451
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Apr 28 09:52:01 2017 -0400

    TIKA-2346 -- add unit tests and configurability for doc, xls and SAX docx parser.
---
 .../parser/microsoft/AbstractPOIFSExtractor.java   |   2 ++
 .../tika/parser/microsoft/ExcelExtractor.java      |   6 +++--
 .../tika/parser/microsoft/WordExtractor.java       |  12 +++++-----
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  26 ++++++++++++++++++++-
 .../ooxml/SXWPFWordExtractorDecorator.java         |   2 +-
 .../tika/parser/microsoft/ExcelParserTest.java     |  16 +++++++++++++
 .../tika/parser/microsoft/WordParserTest.java      |  11 +++++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  26 +++++++++++++++++++++
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java |  15 ++++++++++++
 .../resources/test-documents/testEXCEL_textbox.xls | Bin 0 -> 26112 bytes
 10 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 9c85f30..3e658fd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -49,6 +49,7 @@ abstract class AbstractPOIFSExtractor {
     private final EmbeddedDocumentUtil embeddedDocumentUtil;
     private PasswordProvider passwordProvider;
     protected final Metadata parentMetadata;//metadata of the parent/container document
+    protected final OfficeParserConfig officeParserConfig;
 
     protected AbstractPOIFSExtractor(ParseContext context) {
         this(context, null);
@@ -58,6 +59,7 @@ abstract class AbstractPOIFSExtractor {
         embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
 
         this.passwordProvider = context.get(PasswordProvider.class);
+        this.officeParserConfig = context.get(OfficeParserConfig.class, new OfficeParserConfig());
         this.parentMetadata = parentMetadata;
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 1f336d8..40f0b52 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -455,8 +455,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                     break;
 
                 case TextObjectRecord.sid:
-                    TextObjectRecord tor = (TextObjectRecord) record;
-                    addTextCell(record, tor.getStr().getString());
+                    if (extractor.officeParserConfig.getIncludeShapeBasedContent()) {
+                        TextObjectRecord tor = (TextObjectRecord) record;
+                        addTextCell(record, tor.getStr().getString());
+                    }
                     break;
 
                 case SeriesTextRecord.sid: // Chart label or title
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 3180925..cc961f7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -79,7 +79,6 @@ public class WordExtractor extends AbstractPOIFSExtractor {
         fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
     }
 
-    private final boolean extractDeletedContent;
     // True if we are currently in the named style tag:
     private boolean curStrikeThrough;
     private boolean curBold;
@@ -90,7 +89,6 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     public WordExtractor(ParseContext context, Metadata metadata) {
         super(context);
         this.metadata = metadata;
-        extractDeletedContent = context.get(OfficeParserConfig.class).getIncludeDeletedContent();
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -189,9 +187,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
         }
 
-        // Do everything else
-        for (String paragraph : wordExtractor.getMainTextboxText()) {
-            xhtml.element("p", paragraph);
+        if (officeParserConfig.getIncludeShapeBasedContent()) {
+            // Do everything else
+            for (String paragraph : wordExtractor.getMainTextboxText()) {
+                xhtml.element("p", paragraph);
+            }
         }
 
         for (String paragraph : wordExtractor.getFootnoteText()) {
@@ -668,7 +668,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             return false;
         }
         return !cr.isMarkedDeleted() ||
-                (cr.isMarkedDeleted() && extractDeletedContent);
+                (cr.isMarkedDeleted() && officeParserConfig.getIncludeDeletedContent());
     }
 
     public static class TagAndStyle {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index b3a9f68..c430f7d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -102,6 +102,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String MOVE_FROM = "moveFrom";
     private final static String MOVE_TO = "moveTo";
     private final static String ENDNOTE_REFERENCE = "endnoteReference";
+    private static final String TEXTBOX = "textbox";
+
 
     private final XWPFBodyContentsHandler bodyContentsHandler;
 
@@ -135,20 +137,28 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
     private final RunProperties currRunProperties = new RunProperties();
     private final ParagraphProperties currPProperties = new ParagraphProperties();
-
+    private final boolean includeTextBox;
     private final StringBuilder runBuffer = new StringBuilder();
 
 
     private boolean inDelText = false;
     private boolean inHlinkClick = false;
+    private boolean inTextBox = false;
 
     private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
 
     private DateUtils dateUtils = new DateUtils();
+
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
                                              Map<String, String> hyperlinks) {
+        this(bodyContentsHandler, hyperlinks, true);
+    }
+
+    public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
+                                             Map<String, String> hyperlinks, boolean includeTextBox) {
         this.bodyContentsHandler = bodyContentsHandler;
         this.linkedRelationships = hyperlinks;
+        this.includeTextBox = includeTextBox;
     }
 
 
@@ -189,6 +199,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         if (inACChoiceDepth > 0) {
             return;
         }
+
+        if (! includeTextBox && localName.equals(TEXTBOX)) {
+            inTextBox = true;
+            return;
+        }
         //these are sorted descending by frequency within docx files
         //in our regression corpus.
         //yes, I know, likely premature optimization...
@@ -344,6 +359,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             return;
         }
 
+        if (! includeTextBox && localName.equals(TEXTBOX)) {
+            inTextBox = false;
+            return;
+        }
         if (PIC.equals(localName)) { //PIC_NS
             handlePict();
             inPic = false;
@@ -417,7 +436,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
         if (inACChoiceDepth > 0) {
             return;
+        } else if (! includeTextBox && inTextBox) {
+            return;
         }
+
         if (editType.equals(EditType.MOVE_FROM) && inT) {
             if (bodyContentsHandler.getIncludeMoveFromText()) {
                 runBuffer.append(ch, start, length);
@@ -433,6 +455,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
         if (inACChoiceDepth > 0) {
             return;
+        } else if (! includeTextBox && inTextBox) {
+            return;
         }
 
         if (inT) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d923a2c..0ae4977 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -183,7 +183,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                     new OfflineContentHandler(new EmbeddedContentHandler(
                             new OOXMLWordAndPowerPointTextHandler(
                                     new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
-                                            config), linkedRelationships))));
+                                            config), linkedRelationships, config.getIncludeShapeBasedContent()))));
         } catch (TikaException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index fc31958..d9abdf8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -472,4 +472,20 @@ public class ExcelParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testTextBox() throws Exception {
+        String xml = getXML("testEXCEL_textbox.xls").xml;
+        assertContains("autoshape", xml);
+    }
+
+    //TIKA-2346
+    @Test
+    public void testTurningOffTextBoxExtractionExcel() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeShapeBasedContent(false);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+        String xml = getXML("testEXCEL_textbox.xls", pc).xml;
+        assertNotContained("autoshape", xml);
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index df6d807..f7036a8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -196,6 +196,17 @@ public class WordParserTest extends TikaTest {
         }
     }
 
+    //TIKA-2346
+    @Test
+    public void testTurningOffTextBox() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeShapeBasedContent(false);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+        String xml = getXML("testWORD_various.doc", pc).xml;
+        assertNotContained("text box", xml);
+    }
+
     @Test
     public void testVarious() throws Exception {
         ContentHandler handler = new BodyContentHandler();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 6420545..525913f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -966,6 +966,20 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("This text is inside of a text box in the footer of the document.", xml);
     }
 
+    //TIKA-2346
+    @Test
+    public void testTurningOffTextBoxExtraction() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeShapeBasedContent(false);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+        String xml = getXML("testWORD_text_box.docx", pc).xml;
+        assertContains("This text is directly in the body of the document.", xml);
+        assertNotContained("This text is inside of a text box in the body of the document.", xml);
+        assertNotContained("This text is inside of a text box in the header of the document.", xml);
+        assertNotContained("This text is inside of a text box in the footer of the document.", xml);
+    }
+
     // TIKA-1032:
     @Test
     public void testEmbeddedPPTXTwoSlides() throws Exception {
@@ -1003,6 +1017,18 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("some autoshape", r.xml);
     }
 
+    //TIKA-2346
+    @Test
+    public void testTurningOffTextBoxExtractionExcel() throws Exception {
+
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeShapeBasedContent(false);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+        String xml = getXML("testEXCEL_textbox.xlsx", pc).xml;
+        assertNotContained("autoshape", xml);
+    }
+
     //TIKA-792; with room for future missing bean tests
     @Test
     public void testWordMissingOOXMLBeans() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 206385a..f1e867a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -504,6 +504,21 @@ public class SXWPFExtractorTest extends TikaTest {
         assertContains("This text is inside of a text box in the footer of the document.", xml);
     }
 
+    //TIKA-2346
+    @Test
+    public void testTurningOffTextBoxExtraction() throws Exception {
+        ParseContext pc = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setIncludeShapeBasedContent(false);
+        officeParserConfig.setUseSAXDocxExtractor(true);
+        pc.set(OfficeParserConfig.class, officeParserConfig);
+        String xml = getXML("testWORD_text_box.docx", pc).xml;
+        assertContains("This text is directly in the body of the document.", xml);
+        assertNotContained("This text is inside of a text box in the body of the document.", xml);
+        assertNotContained("This text is inside of a text box in the header of the document.", xml);
+        assertNotContained("This text is inside of a text box in the footer of the document.", xml);
+    }
+
     /**
      * Test for missing text described in
      * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls
new file mode 100644
index 0000000..17e4b8a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xls differ

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.